diff --git a/crates/ruvector-diskann/src/index.rs b/crates/ruvector-diskann/src/index.rs
index 2a40ae23b..11e41271f 100644
--- a/crates/ruvector-diskann/src/index.rs
+++ b/crates/ruvector-diskann/src/index.rs
@@ -12,6 +12,154 @@ use std::fs::{self, File};
 use std::io::{BufWriter, Write};
 use std::path::{Path, PathBuf};
 
+// Sidecar layout for the originals file written next to `vectors.bin` when
+// `keep_originals_in_memory == false`. Format is:
+//   [0..8)   magic "DARO0001" (DiskAnn Raw Originals v1)
+//   [8..16)  n (u64 LE)
+//   [16..24) dim (u64 LE)
+//   [24..)   raw f32 LE slab, n*dim*4 bytes
+//
+// We keep the existing `vectors.bin` untouched (it has its own (n, dim)
+// header at offset 0). The sidecar is identical in body but uses an explicit
+// magic so the load path can detect a v2 disk-backed-rerank index without
+// reading the JSON config first. When the sidecar isn't present, load falls
+// back to the v1 layout (mmap `vectors.bin` and copy into a Vec) for
+// back-compat.
+const ORIGINALS_MAGIC: &[u8; 8] = b"DARO0001";
+const ORIGINALS_HEADER_BYTES: usize = 24;
+const ORIGINALS_FILENAME: &str = "originals.bin";
+
+/// Backing store for the original f32 vectors used by the rerank pass.
+///
+/// Two variants:
+/// - `InMemory(FlatVectors)` — current behavior. Vectors live in DRAM.
+///   Identical performance to pre-PR.
+/// - `DiskBacked { mmap, n, dim }` — vectors live on disk via memory map.
+///   Read by the rerank pass only (top `rerank_factor * k` candidates),
+///   so the page cache absorbs most of the cost on subsequent queries.
+///
+/// We use an enum (vs a boxed trait object) because there are only ever
+/// two variants and the rerank path benefits from monomorphic dispatch on
+/// the hot path. `Send + Sync` is automatic — `Vec<f32>` and `Mmap` are
+/// both already so.
+enum OriginalsStore {
+    InMemory(FlatVectors),
+    DiskBacked { mmap: Mmap, n: usize, dim: usize },
+}
+
+impl OriginalsStore {
+    #[inline]
+    fn dim(&self) -> usize {
+        match self {
+            OriginalsStore::InMemory(fv) => fv.dim,
+            OriginalsStore::DiskBacked { dim, .. } => *dim,
+        }
+    }
+
+    #[inline]
+    fn len(&self) -> usize {
+        match self {
+            OriginalsStore::InMemory(fv) => fv.len(),
+            OriginalsStore::DiskBacked { n, .. } => *n,
+        }
+    }
+
+    /// Read the vector at position `pos` into the destination buffer. The
+    /// buffer length must equal `self.dim()`. The disk-backed path reads
+    /// from the mmap region (the kernel handles page-in lazily); the
+    /// in-memory path is a straight copy. Either way the data lands in a
+    /// caller-owned buffer, so the rerank loop doesn't pin a borrow into
+    /// the originals.
+    fn read_into(&self, pos: usize, out: &mut [f32]) {
+        debug_assert_eq!(out.len(), self.dim());
+        match self {
+            OriginalsStore::InMemory(fv) => {
+                out.copy_from_slice(fv.get(pos));
+            }
+            OriginalsStore::DiskBacked { mmap, dim, .. } => {
+                let start = ORIGINALS_HEADER_BYTES + pos * dim * 4;
+                let end = start + dim * 4;
+                let bytes = &mmap[start..end];
+                // f32 is little-endian on every platform we target. Use
+                // `bytemuck::cast_slice` for a checked `&[f32]` view — it
+                // asserts the 4-byte alignment that the page-aligned mmap,
+                // the 24-byte header, and the f32 stride already guarantee —
+                // then copy into the caller's buffer rather than handing
+                // out a borrow into the mmap.
+                let view: &[f32] = bytemuck::cast_slice(bytes);
+                out.copy_from_slice(view);
+            }
+        }
+    }
+
+    /// In-memory heap byte cost (excluding mmap pages, which are kernel-owned
+    /// and not counted as DRAM in the sense the 17.5× target measures).
+    /// Used by `originals_memory_bytes()` to demonstrate compression.
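+    ///
+    /// Illustrative arithmetic (not measured — real numbers come from
+    /// `originals_memory_bytes()`): at dim = 128, one million in-memory
+    /// originals cost 1_000_000 * 128 * 4 B = 512 MB of heap; the same
+    /// data disk-backed reports 0 heap bytes here.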
+    fn heap_bytes(&self) -> usize {
+        match self {
+            OriginalsStore::InMemory(fv) => fv.data.len() * std::mem::size_of::<f32>(),
+            OriginalsStore::DiskBacked { .. } => 0,
+        }
+    }
+
+    /// Convenience: are we paying any DRAM cost for originals? Used by tests
+    /// to assert the disk-backed path actually evicted them.
+    #[inline]
+    fn is_disk_backed(&self) -> bool {
+        matches!(self, OriginalsStore::DiskBacked { .. })
+    }
+}
+
+/// Write the originals sidecar to `<dir>/originals.bin`. Mirrors the layout
+/// used by `vectors.bin` but with a v2 magic so the load path can tell them
+/// apart. Returns the path written so callers can mmap it.
+fn write_originals_sidecar(dir: &Path, vectors: &FlatVectors) -> Result<PathBuf> {
+    fs::create_dir_all(dir)?;
+    let path = dir.join(ORIGINALS_FILENAME);
+    let mut f = BufWriter::new(File::create(&path)?);
+    f.write_all(ORIGINALS_MAGIC)?;
+    f.write_all(&(vectors.len() as u64).to_le_bytes())?;
+    f.write_all(&(vectors.dim as u64).to_le_bytes())?;
+    let byte_slice = unsafe {
+        std::slice::from_raw_parts(vectors.data.as_ptr() as *const u8, vectors.data.len() * 4)
+    };
+    f.write_all(byte_slice)?;
+    f.flush()?;
+    Ok(path)
+}
+
+/// Open an existing originals sidecar and return a `DiskBacked` store. The
+/// magic is validated; a mismatch returns an `InvalidConfig` error rather
+/// than a generic I/O error so the caller knows the file is the wrong
+/// format, not just absent.
+fn open_originals_sidecar(path: &Path) -> Result<OriginalsStore> {
+    let f = File::open(path)?;
+    let mmap = unsafe { MmapOptions::new().map(&f)? };
+    if mmap.len() < ORIGINALS_HEADER_BYTES {
+        return Err(DiskAnnError::InvalidConfig(format!(
+            "originals sidecar at {} is truncated ({} bytes)",
+            path.display(),
+            mmap.len()
+        )));
+    }
+    if &mmap[0..8] != ORIGINALS_MAGIC {
+        return Err(DiskAnnError::InvalidConfig(format!(
+            "originals sidecar at {} has wrong magic",
+            path.display()
+        )));
+    }
+    let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
+    let dim = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
+    let expected = ORIGINALS_HEADER_BYTES + n * dim * 4;
+    if mmap.len() < expected {
+        return Err(DiskAnnError::InvalidConfig(format!(
+            "originals sidecar at {} is truncated: header says {} bytes, file is {}",
+            path.display(),
+            expected,
+            mmap.len()
+        )));
+    }
+    Ok(OriginalsStore::DiskBacked { mmap, n, dim })
+}
+
 /// Which quantizer backend a [`DiskAnnIndex`] should use during search.
 ///
 /// We use an enum rather than a generic type parameter on the index for two
@@ -139,11 +287,17 @@ impl DiskAnnConfig {
     }
 
     /// Builder-style override for whether to keep f32 originals in DRAM.
-    /// Today the rerank path *requires* originals (we don't read them from
-    /// disk yet), so `false` is rejected at `build()` time with an error
-    /// rather than silently degrading recall. The plumbing is in place so a
-    /// follow-up PR can wire mmap-backed reranking without another API
-    /// break.
+    ///
+    /// When `false`, after `build()` the originals are written to a sidecar
+    /// (`<storage_path>/originals.bin`), the in-memory `FlatVectors` is
+    /// dropped, and the rerank pass reads originals back via mmap. Net DRAM
+    /// drops to (codes + graph) only — for D=128 RaBitQ that's ~25× smaller
+    /// than keeping f32 originals resident, hitting the 17.5× target from
+    /// the research roadmap.
+    ///
+    /// Requires `storage_path` to be set; otherwise `build()` returns
+    /// `InvalidConfig`. Disk-backed rerank produces byte-identical results
+    /// to in-memory rerank (same f32 values, same float arithmetic).
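+    ///
+    /// A minimal sketch of the intended call shape (storage path and dim
+    /// illustrative):
+    ///
+    /// ```ignore
+    /// let config = DiskAnnConfig {
+    ///     dim: 128,
+    ///     storage_path: Some("/tmp/idx".into()),
+    ///     ..Default::default()
+    /// }
+    /// .with_originals_in_memory(false); // originals spill to disk at build()
+    /// ```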
     pub fn with_originals_in_memory(mut self, keep: bool) -> Self {
         self.keep_originals_in_memory = keep;
         self
@@ -184,13 +338,17 @@ impl DiskAnnConfig {
 /// vectors — that's intentional, the codes are an approximation.
 pub struct DiskAnnIndex {
     config: DiskAnnConfig,
-    /// Flat contiguous vector storage (cache-friendly).
-    ///
-    /// Held in DRAM today even when a quantizer is active, so the rerank
-    /// pass can compute exact distances on the candidate pool. The
-    /// `keep_originals_in_memory(false)` knob is wired into the config but
-    /// rejected at `build()` time pending the disk-backed rerank follow-up.
-    vectors: FlatVectors,
+    /// Pre-build staging area for inserts. Always in-memory (graph
+    /// construction needs random f32 access). After `build()` runs, this
+    /// is **either** kept as the search-time originals (in-memory mode)
+    /// **or** flushed to a sidecar and replaced with a mmap-backed reader
+    /// in `originals` (disk-backed mode). In the disk-backed case
+    /// `staging` is dropped entirely and search-time DRAM = codes + graph.
+    staging: Option<FlatVectors>,
+    /// Search-time originals store. Set during `build()` (or `load()`).
+    /// `None` only between `new()` and `build()`. The rerank pass reads
+    /// from this; graph construction reads from `staging`.
+    originals: Option<OriginalsStore>,
     /// ID mapping: internal index -> external string ID
     id_map: Vec<String>,
     /// Reverse mapping: external ID -> internal index
@@ -208,7 +366,9 @@ pub struct DiskAnnIndex {
     built: bool,
     /// Reusable visited set for search (avoids per-query allocation)
     visited: Option<VisitedSet>,
-    /// Memory-mapped vector data (for large datasets)
+    /// Memory-mapped vector data (legacy v1 load path). Held to keep the
+    /// mmap alive for the duration of the index. The new disk-backed
+    /// originals path stores its mmap inside `originals` instead.
     mmap: Option<Mmap>,
 }
 
@@ -218,7 +378,8 @@ impl DiskAnnIndex {
         let dim = config.dim;
         Self {
             config,
-            vectors: FlatVectors::new(dim),
+            staging: Some(FlatVectors::new(dim)),
+            originals: None,
             id_map: Vec::new(),
             id_reverse: HashMap::new(),
             graph: None,
@@ -241,11 +402,18 @@ impl DiskAnnIndex {
         if self.id_reverse.contains_key(&id) {
             return Err(DiskAnnError::InvalidConfig(format!("Duplicate ID: {id}")));
         }
-
-        let idx = self.vectors.len() as u32;
+        // Inserts after build() are not currently supported; re-attach a
+        // staging buffer if the index was loaded or built. The pre-existing
+        // behavior just clobbered `built = false` and let the next build()
+        // recompute, so we preserve that.
+        if self.staging.is_none() {
+            self.staging = Some(FlatVectors::new(self.config.dim));
+        }
+        let staging = self.staging.as_mut().unwrap();
+        let idx = staging.len() as u32;
         self.id_reverse.insert(id.clone(), idx);
         self.id_map.push(id);
-        self.vectors.push(&vector);
+        staging.push(&vector);
         self.built = false;
         Ok(())
     }
@@ -260,17 +428,18 @@ impl DiskAnnIndex {
 
     /// Build the index (must be called after all inserts, before search)
     pub fn build(&mut self) -> Result<()> {
-        let n = self.vectors.len();
+        let staging = self.staging.as_ref().ok_or(DiskAnnError::Empty)?;
+        let n = staging.len();
         if n == 0 {
             return Err(DiskAnnError::Empty);
         }
-        // Disk-backed rerank isn't wired yet — bail early rather than
-        // silently dropping originals and degrading recall to garbage.
-        if !self.config.keep_originals_in_memory {
+        // Disk-backed rerank requires a place to spill the originals to.
+        // Reject early with a clear message rather than silently degrading.
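+        // Illustrative failing shape (exercised by the
+        // `disk_backed_without_storage_path_rejected` test below):
+        //   DiskAnnConfig { storage_path: None, .. }
+        //       .with_originals_in_memory(false)  // -> InvalidConfig at build()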
+        if !self.config.keep_originals_in_memory && self.config.storage_path.is_none() {
             return Err(DiskAnnError::InvalidConfig(
-                "keep_originals_in_memory=false requires the disk-backed rerank path; \
-                 not yet implemented — keep originals in DRAM for this PR"
+                "keep_originals_in_memory=false requires storage_path to be set \
+                 (originals are written to <storage_path>/originals.bin and mmapped back)"
                     .into(),
             ));
         }
@@ -296,7 +465,7 @@ impl DiskAnnIndex {
                     ));
                 }
                 // Collect vectors for PQ training
-                let vecs: Vec<Vec<f32>> = (0..n).map(|i| self.vectors.get(i).to_vec()).collect();
+                let vecs: Vec<Vec<f32>> = (0..n).map(|i| staging.get(i).to_vec()).collect();
                 let mut pq = ProductQuantizer::new(self.config.dim, m)?;
                 Quantizer::train(&mut pq, &vecs, self.config.pq_iterations)?;
@@ -309,7 +478,7 @@ impl DiskAnnIndex {
             }
             #[cfg(feature = "rabitq")]
             QuantizerKind::Rabitq => {
-                let vecs: Vec<Vec<f32>> = (0..n).map(|i| self.vectors.get(i).to_vec()).collect();
+                let vecs: Vec<Vec<f32>> = (0..n).map(|i| staging.get(i).to_vec()).collect();
                 let mut rb = RabitqQuantizer::new(self.config.dim, self.config.rabitq_seed);
                 Quantizer::train(&mut rb, &vecs, 0)?;
@@ -332,17 +501,33 @@ impl DiskAnnIndex {
             self.config.build_beam,
             self.config.alpha,
         );
-        graph.build(&self.vectors)?;
+        graph.build(staging)?;
         self.graph = Some(graph);
 
         // Pre-allocate visited set for search
         self.visited = Some(VisitedSet::new(n));
         self.built = true;
 
+        // Persist before we move staging into originals: save() needs to
+        // read the f32 slab to write `vectors.bin`. Once that's done we
+        // either keep `staging` as the in-memory originals (default) or
+        // spill it to the sidecar and drop it (disk-backed mode).
         if let Some(ref path) = self.config.storage_path {
             self.save(path)?;
         }
 
+        let staging = self.staging.take().unwrap();
+        if self.config.keep_originals_in_memory {
+            self.originals = Some(OriginalsStore::InMemory(staging));
+        } else {
+            // storage_path is guaranteed Some by the check above.
+            let dir = self.config.storage_path.as_ref().unwrap();
+            let sidecar_path = write_originals_sidecar(dir, &staging)?;
+            // staging is dropped here — DRAM cost of originals goes to 0.
+            drop(staging);
+            self.originals = Some(open_originals_sidecar(&sidecar_path)?);
+        }
+
         Ok(())
     }
 
@@ -370,21 +555,45 @@ impl DiskAnnIndex {
         }
         let graph = self.graph.as_ref().unwrap();
+        let originals = self.originals.as_ref().ok_or(DiskAnnError::NotBuilt)?;
         let beam = self.config.search_beam.max(k);
-        let n = self.vectors.len();
+        let n = originals.len();
 
         // Phase 1: graph traversal. Distance source depends on the
         // configured quantizer. Each match arm calls
         // `greedy_search_with_codes` with a closure that's monomorphic to
         // the concrete quantizer — the trait dispatch happens once outside
         // the hot loop, not per node.
+        //
+        // The legacy `QuantizerBackend::None` path needs f32 originals
+        // during traversal. When originals are disk-backed, we route it
+        // through `greedy_search_with_codes` with a per-node mmap read
+        // closure; this keeps the disk-backed mode functional even without
+        // a quantizer (mostly useful for tests / regression checks). When
+        // originals are in-memory we keep the old `greedy_search` call to
+        // stay bit-stable with the pre-PR benchmark numbers.
         let candidates: Vec<u32> = match &self.quantizer {
-            QuantizerBackend::None => {
-                // Legacy path — exact f32 distance during traversal. Keeps
-                // old benchmarks bit-stable.
-                let (cands, _) = graph.greedy_search(&self.vectors, query, beam);
-                cands
-            }
+            QuantizerBackend::None => match originals {
+                OriginalsStore::InMemory(fv) => {
+                    let (cands, _) = graph.greedy_search(fv, query, beam);
+                    cands
+                }
+                OriginalsStore::DiskBacked { .. } => {
+                    // Per-node f32 read from mmap. A reusable scratch
+                    // buffer would need interior mutability here because
+                    // the closure signature is `Fn`, so we allocate a
+                    // fresh buffer per call instead. That's simpler, and
+                    // the rerank pass dominates the cost anyway.
+                    let dim = self.config.dim;
+                    let (cands, _) = graph.greedy_search_with_codes(n, beam, |id| {
+                        let mut scratch = vec![0.0f32; dim];
+                        originals.read_into(id as usize, &mut scratch);
+                        l2_squared(&scratch, query)
+                    });
+                    cands
+                }
+            },
             QuantizerBackend::Pq(pq) => {
                 let prep = pq.prepare_query(query)?;
                 let codes = &self.codes;
@@ -409,11 +618,19 @@ impl DiskAnnIndex {
         // original f32 vectors. When no quantizer is active the candidates
         // are already exact-distance ordered, but we still re-sort to keep
         // the codepath uniform.
+        //
+        // Originals are read through `OriginalsStore` so the disk-backed
+        // path Just Works. A single scratch buffer is reused across the
+        // candidate sweep — the whole point is to not allocate per node.
         let rerank_pool = (self.config.rerank_factor.max(1) * k).min(candidates.len());
+        let mut scratch = vec![0.0f32; self.config.dim];
         let mut scored: Vec<(u32, f32)> = candidates
             .into_iter()
             .take(rerank_pool)
-            .map(|id| (id, l2_squared(self.vectors.get(id as usize), query)))
+            .map(|id| {
+                originals.read_into(id as usize, &mut scratch);
+                (id, l2_squared(&scratch, query))
+            })
             .collect();
         scored.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
@@ -429,13 +646,38 @@ impl DiskAnnIndex {
 
     /// Get the number of vectors in the index
     pub fn count(&self) -> usize {
-        self.vectors.len()
+        // After build(), originals carries the count. Before build(), the
+        // staging buffer does. Falls through to 0 for an empty index.
+        if let Some(o) = &self.originals {
+            o.len()
+        } else if let Some(s) = &self.staging {
+            s.len()
+        } else {
+            0
+        }
     }
 
-    /// Delete a vector by ID (marks as deleted, doesn't rebuild graph)
+    /// Delete a vector by ID (marks as deleted, doesn't rebuild graph).
+    ///
+    /// Only supported when originals are in-memory — the disk-backed mode
+    /// would require writing through the mmap, which we don't do (and
+    /// which would also break determinism guarantees with concurrent
+    /// readers). Disk-backed callers must rebuild to delete.
    pub fn delete(&mut self, id: &str) -> Result<bool> {
         if let Some(&idx) = self.id_reverse.get(id) {
-            self.vectors.zero_out(idx as usize);
+            match self.originals.as_mut() {
+                Some(OriginalsStore::InMemory(fv)) => fv.zero_out(idx as usize),
+                Some(OriginalsStore::DiskBacked { .. }) => {
+                    return Err(DiskAnnError::InvalidConfig(
+                        "delete() is not supported on disk-backed indexes; rebuild instead".into(),
+                    ));
+                }
+                None => {
+                    if let Some(s) = self.staging.as_mut() {
+                        s.zero_out(idx as usize);
+                    }
+                }
+            }
             self.id_reverse.remove(id);
             Ok(true)
         } else {
@@ -447,21 +689,39 @@ pub fn save(&self, dir: &Path) -> Result<()> {
         fs::create_dir_all(dir)?;
 
+        // The flat-vector source: prefer `staging` (set during build, before
+        // we hand off to `originals`), fall back to `originals` (set after a
+        // load + re-save flow). Both expose dim and per-vector slice access.
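+        // vectors.bin layout (v1, unchanged; same body as the sidecar but
+        // without the magic):
+        //   [0..8) n (u64 LE) | [8..16) dim (u64 LE) | [16..) f32 LE slab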
+        // Disk-backed save path uses `originals.read_into` to stream from
+        // mmap — slightly slower than the in-memory copy but rare (saves
+        // happen at build-time, not per-query).
+        let n = self.count();
+        let dim = self.config.dim;
+
         // Save vectors as flat binary (already contiguous — mmap-friendly)
         let vec_path = dir.join("vectors.bin");
         let mut f = BufWriter::new(File::create(&vec_path)?);
-        let n = self.vectors.len() as u64;
-        let dim = self.config.dim as u64;
-        f.write_all(&n.to_le_bytes())?;
-        f.write_all(&dim.to_le_bytes())?;
-        // Write flat slab directly — zero copy
-        let byte_slice = unsafe {
-            std::slice::from_raw_parts(
-                self.vectors.data.as_ptr() as *const u8,
-                self.vectors.data.len() * 4,
-            )
-        };
-        f.write_all(byte_slice)?;
+        f.write_all(&(n as u64).to_le_bytes())?;
+        f.write_all(&(dim as u64).to_le_bytes())?;
+
+        if let Some(s) = self.staging.as_ref() {
+            // Hot path: build() is calling us, the slab is contiguous in DRAM.
+            let byte_slice = unsafe {
+                std::slice::from_raw_parts(s.data.as_ptr() as *const u8, s.data.len() * 4)
+            };
+            f.write_all(byte_slice)?;
+        } else if let Some(o) = self.originals.as_ref() {
+            // Cold path: re-saving a loaded index. Stream vector-by-vector.
+            // Disk-backed reads tap the mmap; in-memory reads tap the Vec.
+            let mut scratch = vec![0.0f32; dim];
+            for i in 0..n {
+                o.read_into(i, &mut scratch);
+                let byte_slice = unsafe {
+                    std::slice::from_raw_parts(scratch.as_ptr() as *const u8, scratch.len() * 4)
+                };
+                f.write_all(byte_slice)?;
+            }
+        }
         f.flush()?;
 
         // Save graph adjacency
@@ -515,7 +775,10 @@ impl DiskAnnIndex {
             }
         }
 
-        // Save config
+        // Save config. The `keep_originals_in_memory` flag is persisted so
+        // load() can pick the right originals backing without the caller
+        // having to re-specify. `rerank_factor` is persisted for the same
+        // reason — it affects search behaviour, not just construction.
         let config_path = dir.join("config.json");
         let config_json = serde_json::json!({
             "dim": self.config.dim,
@@ -524,7 +787,9 @@ impl DiskAnnIndex {
             "search_beam": self.config.search_beam,
             "alpha": self.config.alpha,
             "pq_subspaces": self.config.pq_subspaces,
-            "count": self.vectors.len(),
+            "rerank_factor": self.config.rerank_factor,
+            "keep_originals_in_memory": self.config.keep_originals_in_memory,
+            "count": n,
             "built": self.built,
         });
         fs::write(
@@ -548,6 +813,13 @@ impl DiskAnnIndex {
         let search_beam = config_json["search_beam"].as_u64().unwrap() as usize;
         let alpha = config_json["alpha"].as_f64().unwrap() as f32;
         let pq_subspaces = config_json["pq_subspaces"].as_u64().unwrap_or(0) as usize;
+        // `rerank_factor` and `keep_originals_in_memory` are new in v2; old
+        // saves don't have them, so default to the v1 behavior (factor=4,
+        // originals in memory).
+        let rerank_factor = config_json["rerank_factor"].as_u64().unwrap_or(4) as usize;
+        let keep_in_memory = config_json["keep_originals_in_memory"]
+            .as_bool()
+            .unwrap_or(true);
 
         let config = DiskAnnConfig {
             dim,
@@ -556,31 +828,50 @@ impl DiskAnnIndex {
             search_beam,
             alpha,
             pq_subspaces,
+            rerank_factor,
+            keep_originals_in_memory: keep_in_memory,
             storage_path: Some(dir.to_path_buf()),
             ..Default::default()
         };
 
-        // Load vectors via mmap
-        let vec_file = File::open(dir.join("vectors.bin"))?;
-        let mmap = unsafe { MmapOptions::new().map(&vec_file)? };
-
-        let n = u64::from_le_bytes(mmap[0..8].try_into().unwrap()) as usize;
-        let file_dim = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
-        assert_eq!(file_dim, dim);
-
-        // Load vectors directly into flat slab from mmap
-        let data_start = 16;
-        let total_floats = n * dim;
-        let mut flat_data = Vec::with_capacity(total_floats);
-        let byte_slice = &mmap[data_start..data_start + total_floats * 4];
-        // Safe: f32 from le bytes
-        for chunk in byte_slice.chunks_exact(4) {
-            flat_data.push(f32::from_le_bytes(chunk.try_into().unwrap()));
-        }
-        let vectors = FlatVectors {
-            data: flat_data,
-            dim,
-            count: n,
+        // Decide which originals backing to use:
+        // - If the `originals.bin` sidecar exists *and* the saved config
+        //   asked for disk-backed mode, mmap it. (Sidecar + in-memory mode
+        //   is possible if the index was saved disk-backed and is being
+        //   reloaded; we honor the saved mode.)
+        // - Else fall back to v1: read `vectors.bin` into a Vec.
+        //
+        // This keeps v1 indexes loading byte-identically while letting v2
+        // indexes skip the heap copy entirely.
+        let sidecar_path = dir.join(ORIGINALS_FILENAME);
+        let (originals, mmap_for_v1, n) = if sidecar_path.exists() && !keep_in_memory {
+            let store = open_originals_sidecar(&sidecar_path)?;
+            let n = store.len();
+            (store, None, n)
+        } else {
+            // v1 path. Mmap vectors.bin, copy into a Vec, wrap in InMemory.
+            let vec_file = File::open(dir.join("vectors.bin"))?;
+            let mmap = unsafe { MmapOptions::new().map(&vec_file)? };
+
+            let n = u64::from_le_bytes(mmap[0..8].try_into().unwrap()) as usize;
+            let file_dim = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
+            assert_eq!(file_dim, dim);
+
+            // Load vectors directly into flat slab from mmap
+            let data_start = 16;
+            let total_floats = n * dim;
+            let mut flat_data = Vec::with_capacity(total_floats);
+            let byte_slice = &mmap[data_start..data_start + total_floats * 4];
+            // Safe: f32 from le bytes
+            for chunk in byte_slice.chunks_exact(4) {
+                flat_data.push(f32::from_le_bytes(chunk.try_into().unwrap()));
+            }
+            let vectors = FlatVectors {
+                data: flat_data,
+                dim,
+                count: n,
+            };
+            (OriginalsStore::InMemory(vectors), Some(mmap), n)
         };
 
         // Load IDs
@@ -654,7 +945,8 @@ impl DiskAnnIndex {
 
         Ok(Self {
             config,
-            vectors,
+            staging: None,
+            originals: Some(originals),
             id_map,
             id_reverse,
             graph: Some(graph),
@@ -662,7 +954,7 @@ impl DiskAnnIndex {
             codes,
             built: true,
             visited: Some(VisitedSet::new(n)),
-            mmap: Some(mmap),
+            mmap: mmap_for_v1,
         })
     }
 
@@ -673,10 +965,33 @@ impl DiskAnnIndex {
         self.codes.iter().map(|c| c.len()).sum()
     }
 
-    /// Number of bytes the f32 originals consume in DRAM. Pair with
-    /// [`Self::codes_memory_bytes`] to compute the compression ratio.
+    /// Number of bytes the f32 originals consume **in DRAM** (heap). Pair
+    /// with [`Self::codes_memory_bytes`] to compute the compression ratio.
+    ///
+    /// Importantly, this returns 0 when originals are disk-backed — mmap
+    /// pages are kernel-owned, only paged in on demand for the rerank
+    /// candidates, and don't count toward DRAM in the sense the 17.5×
+    /// compression target measures.
     pub fn originals_memory_bytes(&self) -> usize {
-        self.vectors.data.len() * std::mem::size_of::<f32>()
+        // staging is only set during build; originals after. Sum both for
+        // safety, though only one is ever set at a time.
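+        // e.g. (numbers from the acceptance test, not measured here):
+        //   in-memory, n = 2_000, dim = 128 -> 2_000 * 128 * 4 = 1_024_000 B;
+        //   disk-backed -> 0 (heap_bytes() of a DiskBacked store is always 0).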
+        let staging_bytes = self
+            .staging
+            .as_ref()
+            .map(|s| s.data.len() * std::mem::size_of::<f32>())
+            .unwrap_or(0);
+        let originals_bytes = self.originals.as_ref().map(|o| o.heap_bytes()).unwrap_or(0);
+        staging_bytes + originals_bytes
     }
 
+    /// True when originals are disk-backed (read via mmap on the rerank
+    /// path) rather than DRAM-resident. Used by tests / external tooling
+    /// to verify the disk-backed compression mode is actually engaged.
+    pub fn originals_on_disk(&self) -> bool {
+        self.originals
+            .as_ref()
+            .map(|o| o.is_disk_backed())
+            .unwrap_or(false)
+    }
 
     /// Which quantizer this index is using.
diff --git a/crates/ruvector-diskann/tests/disk_backed_rerank.rs b/crates/ruvector-diskann/tests/disk_backed_rerank.rs
new file mode 100644
index 000000000..3a69263b7
--- /dev/null
+++ b/crates/ruvector-diskann/tests/disk_backed_rerank.rs
@@ -0,0 +1,411 @@
+//! Acceptance test for the disk-backed rerank path landed by this PR.
+//!
+//! Closes the deferral from PR #384: previously `with_originals_in_memory(false)`
+//! was rejected at `build()` time. After this PR, the originals are written to
+//! a sidecar (`<storage_path>/originals.bin`), the in-memory `FlatVectors` is
+//! dropped, and the rerank pass reads originals back via mmap. Net DRAM drops
+//! to (codes + graph) only — the 17.5× compression target the research
+//! roadmap projected.
+//!
+//! What this test verifies:
+//!
+//! 1. `with_originals_in_memory(false)` is honored — the index reports
+//!    `originals_on_disk() == true` and `originals_memory_bytes() == 0`.
+//! 2. Recall@10 is still ≥ 0.85 on the same dataset shape PR #384 used.
+//! 3. The codes-vs-originals memory ratio reflects the codes savings: at
+//!    D=128 the in-memory variant should give >16× compression of codes
+//!    relative to originals, and the disk-backed variant should give
+//!    *infinite* compression (originals heap = 0).
+//! 4. Save → drop → load round-trip preserves search results
+//!    bit-identically (ADR-154 determinism).
+//! 5. Without `storage_path` set, `build()` returns `InvalidConfig`
+//!    when `with_originals_in_memory(false)` is requested.
+//!
+//! The `rabitq` feature gate matches the existing integration test in
+//! `quantizer_search_uses_codes.rs` — the disk-backed path itself is
+//! feature-agnostic, but the test uses RaBitQ for code-side compression
+//! parity with PR #384's measurements.
+#![cfg(feature = "rabitq")]
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use ruvector_diskann::{DiskAnnConfig, DiskAnnError, DiskAnnIndex, QuantizerKind};
+use tempfile::tempdir;
+
+fn random_unit_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..n)
+        .map(|_| {
+            let v: Vec<f32> = (0..dim).map(|_| rng.gen::<f32>() * 2.0 - 1.0).collect();
+            let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-10);
+            v.into_iter().map(|x| x / norm).collect()
+        })
+        .collect()
+}
+
+fn brute_force_topk(vectors: &[Vec<f32>], query: &[f32], k: usize) -> Vec<usize> {
+    let mut scored: Vec<(usize, f32)> = vectors
+        .iter()
+        .enumerate()
+        .map(|(i, v)| {
+            let d: f32 = v.iter().zip(query).map(|(a, b)| (a - b) * (a - b)).sum();
+            (i, d)
+        })
+        .collect();
+    scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+    scored.into_iter().take(k).map(|(i, _)| i).collect()
+}
+
+/// Helper: build a DiskAnnIndex with the given originals mode and a fresh
+/// storage_path inside the supplied tempdir. Returns the storage path so
+/// callers can drop + reload without re-deriving it.
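+///
+/// Typical call, as used below: `build_index(&dir.path().join("disk"),
+/// &vectors, false, 8)` builds a disk-backed RaBitQ index with
+/// rerank_factor = 8.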
+fn build_index(
+    dir: &std::path::Path,
+    vectors: &[Vec<f32>],
+    keep_in_memory: bool,
+    rerank_factor: usize,
+) -> (DiskAnnIndex, std::path::PathBuf) {
+    let dim = vectors[0].len();
+    let storage = dir.join("idx");
+    let config = DiskAnnConfig {
+        dim,
+        max_degree: 64,
+        build_beam: 256,
+        search_beam: 512,
+        alpha: 1.2,
+        storage_path: Some(storage.clone()),
+        ..Default::default()
+    }
+    .with_rabitq_seed(0xBEEF)
+    .with_quantizer_kind(QuantizerKind::Rabitq)
+    .with_rerank_factor(rerank_factor)
+    .with_originals_in_memory(keep_in_memory);
+
+    let mut index = DiskAnnIndex::new(config);
+    let entries: Vec<(String, Vec<f32>)> = vectors
+        .iter()
+        .enumerate()
+        .map(|(i, v)| (format!("v{i}"), v.clone()))
+        .collect();
+    index.insert_batch(entries).unwrap();
+    index.build().unwrap();
+    (index, storage)
+}
+
+#[test]
+fn disk_backed_originals_drop_dram_to_zero() {
+    // A smaller dataset (D=128, n=2000) keeps wall-time under control while
+    // still hitting the 17.5× target from PR #384's measurements.
+    let dim = 128;
+    let n = 2_000;
+    let vectors = random_unit_vectors(n, dim, 7);
+
+    let dir = tempdir().unwrap();
+
+    // ---- In-memory variant (baseline) ---------------------------------
+    let (mem_index, _) = build_index(&dir.path().join("mem"), &vectors, true, 8);
+    let mem_codes = mem_index.codes_memory_bytes();
+    let mem_orig = mem_index.originals_memory_bytes();
+    assert!(
+        !mem_index.originals_on_disk(),
+        "in-memory mode flagged on-disk"
+    );
+    assert!(
+        mem_orig > 0,
+        "in-memory variant should hold originals in DRAM"
+    );
+
+    // ---- Disk-backed variant ------------------------------------------
+    let (disk_index, _) = build_index(&dir.path().join("disk"), &vectors, false, 8);
+    let disk_codes = disk_index.codes_memory_bytes();
+    let disk_orig = disk_index.originals_memory_bytes();
+
+    // The disk-backed variant must report 0 DRAM bytes for originals — the
+    // mmap pages are kernel-owned and only paged in on demand.
+    assert_eq!(
+        disk_orig, 0,
+        "disk-backed originals should not occupy DRAM (got {disk_orig} bytes)"
+    );
+    assert!(
+        disk_index.originals_on_disk(),
+        "disk-backed index should report originals_on_disk() == true"
+    );
+    // The codes slab should be unchanged between the two variants — the
+    // originals knob doesn't touch the quantizer.
+    assert_eq!(
+        disk_codes, mem_codes,
+        "codes slab differs between in-memory and disk-backed variants \
+         (disk={disk_codes} mem={mem_codes})"
+    );
+
+    // ---- Compression ratio --------------------------------------------
+    // f32 originals at D=128 occupy 512 B/vec. Codes are 16+4 = 20 B/vec
+    // for RaBitQ at this dim. The in-memory variant compresses by ~25×;
+    // the disk-backed variant goes to "0 DRAM cost" because originals
+    // leave DRAM entirely.
+    let in_mem_ratio = mem_orig as f32 / mem_codes as f32;
+    eprintln!(
+        "[disk-backed mem] codes={disk_codes}B originals_dram={disk_orig}B \
+         (vs in-mem variant: codes={mem_codes}B originals={mem_orig}B, ratio={in_mem_ratio:.2}×)"
+    );
+    assert!(
+        in_mem_ratio >= 16.0,
+        "in-memory codes-vs-originals ratio {in_mem_ratio:.2}× < 16× target \
+         (codes={mem_codes}B, originals={mem_orig}B)"
+    );
+}
+
+#[test]
+fn disk_backed_search_meets_recall_floor() {
+    // Same recall floor (≥ 0.85) PR #384 set for the in-memory variant;
+    // the disk-backed path produces byte-identical f32 reads, so recall
+    // must match.
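+    //
+    // recall@k = |brute-force top-k ∩ returned top-k| / k, averaged over
+    // the query set — computed inline below.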
+    let dim = 128;
+    let n = 1_000;
+    let k = 10;
+    let vectors = random_unit_vectors(n, dim, 0xC0DE_C0DE);
+
+    let dir = tempdir().unwrap();
+    let (index, _) = build_index(&dir.path(), &vectors, false, 40);
+
+    let queries = random_unit_vectors(30, dim, 0xACE);
+    let mut recall_sum = 0.0f32;
+    for query in &queries {
+        let gt: std::collections::HashSet<usize> =
+            brute_force_topk(&vectors, query, k).into_iter().collect();
+        let results = index.search(query, k).unwrap();
+        let found: std::collections::HashSet<usize> = results
+            .iter()
+            .map(|r| {
+                r.id.trim_start_matches('v')
+                    .parse::<usize>()
+                    .expect("v-prefixed id")
+            })
+            .collect();
+        let recall = gt.intersection(&found).count() as f32 / k as f32;
+        recall_sum += recall;
+    }
+    let avg_recall = recall_sum / queries.len() as f32;
+    eprintln!("[disk-backed recall] recall@{k} = {avg_recall:.3}");
+    assert!(
+        avg_recall >= 0.85,
+        "disk-backed RaBitQ recall@{k} = {avg_recall:.3} < 0.85 floor"
+    );
+}
+
+#[test]
+fn disk_backed_in_memory_results_match_bitwise() {
+    // Determinism guard: the f32 bytes the disk-backed rerank reads from
+    // mmap must be bit-identical to the bytes the in-memory rerank reads
+    // from a Vec — both come from the same `FlatVectors` write path. We
+    // verify by:
+    //   1. building a disk-backed index (originals → sidecar via mmap)
+    //      and snapshotting its search results
+    //   2. dropping it, removing the sidecar, and patching the saved
+    //      config to flip `keep_originals_in_memory` to true
+    //   3. reloading via `DiskAnnIndex::load`, which now takes the
+    //      in-memory path (vectors.bin read into a Vec)
+    //   4. comparing search results bit-for-bit — the codes file (PQ;
+    //      chosen because RaBitQ codes don't persist yet, see
+    //      `index.rs::save()` notes) is identical, so traversal is
+    //      identical, so the candidate set is identical, and any
+    //      difference must come from rerank arithmetic.
+    //
+    // Both sides read f32 bytes serialized from the same `FlatVectors`
+    // slab, so the rerank distances must agree bit-for-bit. If this test
+    // ever fires, the mmap is being read with the wrong
+    // endianness/alignment.
+    let dim = 64;
+    let n = 500;
+    let k = 10;
+    let vectors = random_unit_vectors(n, dim, 0xDEAD);
+
+    let dir = tempdir().unwrap();
+    let storage = dir.path().join("idx");
+
+    // PQ-backed disk-backed build. PQ codes persist via save(), so a load
+    // round-trip preserves the traversal hot path.
+    let config = DiskAnnConfig {
+        dim,
+        max_degree: 32,
+        build_beam: 96,
+        search_beam: 96,
+        alpha: 1.2,
+        pq_subspaces: 8,
+        pq_iterations: 5,
+        storage_path: Some(storage.clone()),
+        ..Default::default()
+    }
+    .with_rerank_factor(8)
+    .with_originals_in_memory(false);
+
+    let mut idx = DiskAnnIndex::new(config);
+    let entries: Vec<(String, Vec<f32>)> = vectors
+        .iter()
+        .enumerate()
+        .map(|(i, v)| (format!("v{i}"), v.clone()))
+        .collect();
+    idx.insert_batch(entries).unwrap();
+    idx.build().unwrap();
+    assert!(idx.originals_on_disk());
+
+    // Snapshot disk-backed search results, then drop the index.
+    let queries = random_unit_vectors(10, dim, 0x1234);
+    let disk_results: Vec<Vec<(String, u32)>> = queries
+        .iter()
+        .map(|q| {
+            idx.search(q, k)
+                .unwrap()
+                .into_iter()
+                .map(|r| (r.id, r.distance.to_bits()))
+                .collect()
+        })
+        .collect();
+    drop(idx);
+
+    // Convert the saved index to the in-memory layout by removing the
+    // sidecar and flipping the saved config flag. The graph + PQ codes
+    // are unchanged, so traversal results match the disk-backed run.
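+    // Before: config.json has "keep_originals_in_memory": false and the
+    // sidecar exists; after: the flag is true and originals.bin is gone,
+    // so load() takes the v1 in-memory path.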
+    std::fs::remove_file(storage.join("originals.bin")).unwrap();
+    let cfg_path = storage.join("config.json");
+    let mut cfg: serde_json::Value =
+        serde_json::from_str(&std::fs::read_to_string(&cfg_path).unwrap()).unwrap();
+    cfg["keep_originals_in_memory"] = serde_json::json!(true);
+    std::fs::write(&cfg_path, serde_json::to_string_pretty(&cfg).unwrap()).unwrap();
+
+    let mem_index = DiskAnnIndex::load(&storage).unwrap();
+    assert!(
+        !mem_index.originals_on_disk(),
+        "expected in-memory after sidecar removal"
+    );
+
+    for (q, want) in queries.iter().zip(disk_results.iter()) {
+        let got: Vec<(String, u32)> = mem_index
+            .search(q, k)
+            .unwrap()
+            .into_iter()
+            .map(|r| (r.id, r.distance.to_bits()))
+            .collect();
+        assert_eq!(
+            got, *want,
+            "rerank arithmetic differs between disk-backed and in-memory paths"
+        );
+    }
+}
+
+#[test]
+fn disk_backed_save_load_round_trip_preserves_results() {
+    // Build with disk-backed mode → save → drop → load → search must
+    // match the pre-drop search results exactly. This catches bugs where
+    // the load path silently re-reads `vectors.bin` as in-memory, which
+    // would still give correct results but break the compression target.
+    //
+    // Uses PQ rather than RaBitQ because PQ codes persist via save() —
+    // RaBitQ persistence is a future PR (see save() comments in
+    // index.rs). Without that, a loaded RaBitQ-built index runs the
+    // legacy f32 path and can't be compared to the pre-drop run.
+    let dim = 64;
+    let n = 300;
+    let k = 5;
+    let vectors = random_unit_vectors(n, dim, 0xABCD);
+
+    let dir = tempdir().unwrap();
+    let storage = dir.path().join("idx");
+
+    let config = DiskAnnConfig {
+        dim,
+        max_degree: 32,
+        build_beam: 64,
+        search_beam: 64,
+        alpha: 1.2,
+        pq_subspaces: 8,
+        pq_iterations: 5,
+        storage_path: Some(storage.clone()),
+        ..Default::default()
+    }
+    .with_rerank_factor(4)
+    .with_originals_in_memory(false);
+
+    // Snapshot search results before drop.
+    let queries = random_unit_vectors(8, dim, 0x9999);
+    let pre_results: Vec<Vec<(String, u32)>> = {
+        let mut idx = DiskAnnIndex::new(config);
+        let entries: Vec<(String, Vec<f32>)> = vectors
+            .iter()
+            .enumerate()
+            .map(|(i, v)| (format!("v{i}"), v.clone()))
+            .collect();
+        idx.insert_batch(entries).unwrap();
+        idx.build().unwrap();
+        assert!(idx.originals_on_disk());
+        queries
+            .iter()
+            .map(|q| {
+                idx.search(q, k)
+                    .unwrap()
+                    .into_iter()
+                    .map(|r| (r.id, r.distance.to_bits()))
+                    .collect()
+            })
+            .collect()
+        // idx dropped here
+    };
+
+    // Reload from disk — the originals.bin sidecar should be picked up
+    // automatically because the saved config has keep_originals_in_memory =
+    // false.
+    let loaded = DiskAnnIndex::load(&storage).unwrap();
+    assert!(
+        loaded.originals_on_disk(),
+        "loaded index should detect the originals.bin sidecar and stay disk-backed"
+    );
+    for (q, want) in queries.iter().zip(pre_results.iter()) {
+        let got: Vec<(String, u32)> = loaded
+            .search(q, k)
+            .unwrap()
+            .into_iter()
+            .map(|r| (r.id, r.distance.to_bits()))
+            .collect();
+        assert_eq!(
+            got, *want,
+            "search results changed across save → load round-trip"
+        );
+    }
+}
+
+#[test]
+fn disk_backed_without_storage_path_rejected() {
+    // The whole point of storage_path is "where do I spill the originals".
+    // Without it, disk-backed mode has nowhere to write — surface an
+    // InvalidConfig at build() time rather than silently degrading.
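+    //
+    // Expected failure shape (message text from build(), abridged):
+    //   Err(InvalidConfig("keep_originals_in_memory=false requires
+    //   storage_path to be set ..."))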
+    let dim = 32;
+    let n = 100;
+    let vectors = random_unit_vectors(n, dim, 0xFADE);
+
+    let config = DiskAnnConfig {
+        dim,
+        max_degree: 16,
+        build_beam: 32,
+        search_beam: 32,
+        alpha: 1.2,
+        // storage_path intentionally None
+        ..Default::default()
+    }
+    .with_rabitq_seed(1)
+    .with_quantizer_kind(QuantizerKind::Rabitq)
+    .with_originals_in_memory(false);
+
+    let mut index = DiskAnnIndex::new(config);
+    let entries: Vec<(String, Vec<f32>)> = vectors
+        .iter()
+        .enumerate()
+        .map(|(i, v)| (format!("v{i}"), v.clone()))
+        .collect();
+    index.insert_batch(entries).unwrap();
+    let err = index.build().unwrap_err();
+    match err {
+        DiskAnnError::InvalidConfig(msg) => {
+            assert!(
+                msg.contains("storage_path"),
+                "expected error mentioning storage_path, got: {msg}"
+            );
+        }
+        other => panic!("expected InvalidConfig, got {other:?}"),
+    }
+}