diff --git a/Cargo.lock b/Cargo.lock index 4d55df0..da2704c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -78,7 +78,7 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "difflib-fast" -version = "0.3.0" +version = "0.3.5" dependencies = [ "metal", "mimalloc", diff --git a/Cargo.toml b/Cargo.toml index b59ba55..d653378 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "difflib-fast" -version = "0.3.0" +version = "0.3.5" description = "Fast, byte-for-byte exact difflib Ratcliff–Obershelp (gestalt) similarity ratio + single-linkage clustering (suffix automaton), plus an exact all-pairs weighted-cosine similarity join (L2AP, CPU+GPU)." keywords = ["difflib", "similarity", "ratcliff-obershelp", "suffix-automaton", "fuzzy"] categories = ["algorithms", "text-processing"] diff --git a/README.md b/README.md index 2c7d3de..9aaa5e1 100644 --- a/README.md +++ b/README.md @@ -218,7 +218,7 @@ pick the wheel for you — grab the one for your platform from the ```bash # macOS Apple Silicon — swap the filename for your platform (see below): -pip install https://github.com/prostomarkeloff/difflib-fast/releases/download/v0.3.0/difflib_fast-0.3.0-cp39-abi3-macosx_11_0_arm64.whl +pip install https://github.com/prostomarkeloff/difflib-fast/releases/download/v0.3.5/difflib_fast-0.3.5-cp39-abi3-macosx_11_0_arm64.whl ``` | platform | wheel suffix | diff --git a/examples/simjoin_pypi.rs b/examples/simjoin_pypi.rs index 79b37aa..d2ff8d6 100644 --- a/examples/simjoin_pypi.rs +++ b/examples/simjoin_pypi.rs @@ -55,7 +55,16 @@ fn main() { let t: f64 = arg(2, 0.8); let reps: usize = arg(3, 3); - let rows = load(&path); + let mut rows = load(&path); + // Optional row subset (SJ_NSUB) to bench the small-corpus regime (e.g. find-dup-defs scale + // ~3216 functions) instead of the full 287k bandwidth-bound regime. 
+ if let Ok(nsub) = std::env::var("SJ_NSUB") { + if let Ok(k) = nsub.parse::<usize>() { + if rows.len() > k { + rows.truncate(k); + } + } + } let n = rows.len(); let nnz: usize = rows.iter().map(Vec::len).sum(); diff --git a/src/simjoin.rs b/src/simjoin.rs index 13057fa..9968cc9 100644 --- a/src/simjoin.rs +++ b/src/simjoin.rs @@ -424,13 +424,34 @@ impl CosineJoiner { /// `1e-9 ≫` the `~1e-15` accumulated error over ~15 terms. const PRUNE_SLACK: f64 = 1e-9; +/// Probe row length at/above which the cheap monotone pre-bound in [`verify_pruned`] is worth it. +/// The pre-bound skips the per-candidate `partition_point` for candidates that can't reach `t`, but +/// `partition_point` over a *short* probe row is already cheap, so on sparse corpora (mean nnz ≈ 11, +/// e.g. `PyPI` type3) the pre-bound's own `fmul`/`fadd`/compare is pure overhead that rarely prunes +/// (match-dense ⇒ high survivor rate). It only pays on **dense** rows (e.g. find-dup-defs +/// patternology, mean nnz ≈ 61) where `partition_point` is costly and the survivor rate is tiny. +/// Gating on `di.len()` (a per-probe, loop-invariant test) keeps the sparse regime regression-free. +const PREBOUND_MIN_DIMS: usize = 24; + /// Phase 1 — accumulate: for each indexed dim of the probe, add `w_probe·w_y` into `acc[y]` for /// every **earlier** `y` (`y < cutoff`, the probe's own id) indexing that dim. Postings are id-sorted, /// so we `break` at the first `y ≥ cutoff`. Leaves `acc` `-1.0` everywhere except the touched ids /// (listed in `touched`, reset in [`verify_pruned`]). One scattered `acc[]` FMA per posting.
#[cfg_attr(feature = "profiling", inline(never))] fn accumulate(index: &[Vec<(u32, f64)>], (di, wi): (&[u32], &[f64]), cutoff: u32, s: &mut Scratch) { - s.touched.clear(); + // Split the borrows so `acc` and `touched` are independent: otherwise the optimiser, seeing + // `s.touched.push(...)` go through the same `&mut Scratch`, can't prove the push leaves + // `s.acc`'s base pointer untouched and reloads `acc.ptr` from the struct on EVERY posting entry + // (one extra load across the ~5M-entry accumulation on dense corpora). With `acc` a separate + // `&mut [f64]` local the base stays in a register; a `touched` realloc can't alias it. + let Scratch { acc, touched, .. } = s; + let acc = acc.as_mut_slice(); + touched.clear(); + // At most one distinct candidate per acc slot, so `n == acc.len()` slots is enough headroom to + // append every first-touch without bounds checks or a realloc inside the loop. + touched.reserve(acc.len()); + let tptr = touched.as_mut_ptr(); + let mut tlen = 0usize; for (&d, &w) in di.iter().zip(wi) { for &(y, wy) in &index[d as usize] { if y >= cutoff { @@ -438,14 +459,23 @@ fn accumulate(index: &[Vec<(u32, f64)>], (di, wi): (&[u32], &[f64]), cutoff: u32 } let yu = y as usize; // SAFETY: `y` is a vector id pushed by `index_suffix`, so `yu < n == acc.len()`. - let a = unsafe { s.acc.get_unchecked_mut(yu) }; - if *a < 0.0 { - *a = 0.0; - s.touched.push(y); - } - *a += w * wy; + let a = unsafe { acc.get_unchecked_mut(yu) }; + // Branchless first-touch: the `*a < 0.0` test fed a data-dependent branch that mispredicts + // (first-touch vs repeat interleave unpredictably across the probe's dims). Instead select + // the base (`0.0` on first touch, current partial dot otherwise) with a conditional move, + // and append `y` to `touched` by an UNCONDITIONAL store that's only *committed* when the + // length is bumped — `tlen += first`. 
Same accumulator values and same touched order as + // the branchy form, so the result is bit-identical; just no mispredicting branch. + let first = *a < 0.0; + let base = if first { 0.0 } else { *a }; + *a = base + w * wy; + // SAFETY: `tlen ≤ distinct candidates so far < acc.len() ≤ touched.capacity()`. + unsafe { *tptr.add(tlen) = y }; + tlen += usize::from(first); } } + // SAFETY: `tlen` first-touch stores were written into the reserved region, in order. + unsafe { touched.set_len(tlen) }; } /// Phase 2 — prune + verify. For each touched candidate `y`, reset its accumulator and test the @@ -474,6 +504,13 @@ fn verify_pruned( sq += w * w; s.xpn.push(sq.sqrt()); } + // `‖probe‖` = the full prefix norm. Since `xpn` is monotonic, `xpn[kstar] ≤ xnorm` for every + // candidate, so `a + xnorm·pnorm ≥ a + xpn[kstar]·pnorm` (the exact bound). A candidate failing + // the cheap `a + xnorm·pnorm < need` test therefore also fails the exact bound — prune it without + // the per-candidate `partition_point`. The survivor set is bit-identical; only the binary search + // is skipped for the ~99% of touched candidates that can't reach `t` on dense corpora. + let xnorm = sq.sqrt(); + let prebound = di.len() >= PREBOUND_MIN_DIMS; let need = t - PRUNE_SLACK; let Scratch { acc, touched, xpn } = s; // (Software-prefetching the candidate row a few ahead was tried + reverted: no measurable change @@ -485,6 +522,11 @@ fn verify_pruned( let a = unsafe { std::mem::replace(acc.get_unchecked_mut(yu), -1.0) }; // SAFETY: `yu < n`. One scattered load fetches both prune fields. let bd = unsafe { *cached.bound.get_unchecked(yu) }; + // Cheap monotone pre-bound (dense rows only) — skip the binary search when even `xnorm` + // can't clear `need`. `prebound` is loop-invariant, so sparse rows pay nothing. + if prebound && a + xnorm * bd.pnorm < need { + continue; + } // Number of probe dims with rank < split[y] → index into xpn (di sorted ascending). 
let kstar = di.partition_point(|&d| d < bd.split); // SAFETY: kstar ≤ di.len() == wi.len() = xpn.len()-1. @@ -562,6 +604,8 @@ fn collect_survivors(c: &Corpus, i: usize, t: f64, s: &mut Scratch, cached: &Cac sq += w * w; s.xpn.push(sq.sqrt()); } + let xnorm = sq.sqrt(); + let prebound = di.len() >= PREBOUND_MIN_DIMS; let need = t - PRUNE_SLACK; let Scratch { acc, touched, xpn } = s; for &y in touched.iter() { @@ -569,6 +613,10 @@ fn collect_survivors(c: &Corpus, i: usize, t: f64, s: &mut Scratch, cached: &Cac // SAFETY: `yu < n` (same provenance as in `accumulate`). let a = unsafe { std::mem::replace(acc.get_unchecked_mut(yu), -1.0) }; let bd = unsafe { *cached.bound.get_unchecked(yu) }; + // Cheap monotone pre-bound (see `verify_pruned`) — same survivor set, skips the binary search. + if prebound && a + xnorm * bd.pnorm < need { + continue; + } let kstar = di.partition_point(|&d| d < bd.split); let bound = a + unsafe { xpn.get_unchecked(kstar) } * bd.pnorm; if bound >= need { @@ -700,10 +748,15 @@ pub fn cosine_join_counts(c: &Corpus, t: f64) -> (u64, u64, u64) { sq += w * w; s.xpn.push(sq.sqrt()); } + let xnorm = sq.sqrt(); + let prebound = di.len() >= PREBOUND_MIN_DIMS; for &y in &s.touched { let yu = y as usize; let a = std::mem::replace(&mut s.acc[yu], -1.0); let bd = cached.bound[yu]; + if prebound && a + xnorm * bd.pnorm < need { + continue; + } let kstar = di.partition_point(|&d| d < bd.split); if a + s.xpn[kstar] * bd.pnorm >= need { survivors += 1;