diff --git a/.gitignore b/.gitignore index d546a066..39168561 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ docs/parity-analysis/* !docs/parity-analysis/notes/ !docs/parity-analysis/notes/2026-05-25-precursor-cal-ship-gates.md !docs/parity-analysis/notes/2026-05-25-spece-tail-exploration.md +!docs/parity-analysis/notes/2026-05-28-phase2-peak-rank-parity.md !docs/parity-analysis/notes/2026-05-26-score-psm-trace-findings.md !docs/parity-analysis/notes/score-psm-trace-artifacts/ !docs/parity-analysis/notes/score-psm-trace-artifacts/* diff --git a/crates/msgf-rust/src/bin/msgf-trace.rs b/crates/msgf-rust/src/bin/msgf-trace.rs index fadbc5bf..7530ea4b 100644 --- a/crates/msgf-rust/src/bin/msgf-trace.rs +++ b/crates/msgf-rust/src/bin/msgf-trace.rs @@ -168,6 +168,11 @@ struct Cli { /// existing human-readable stderr trace is unaffected. #[arg(long)] trace_json: Option, + /// Dump the post-filter, post-deconvolution active peak list (sorted by + /// rank ascending) for this scan/charge as `rankmzintensity` + /// lines, preceded by a `DUMP_PEAKS` header. Read-only diagnostic. + #[arg(long)] + dump_peaks: bool, } fn main() -> ExitCode { @@ -584,6 +589,27 @@ fn run(cli: Cli) -> Result<(), Box> { println!(" PSM.score (from queue) = {}", top1.score); } + // --------------------------------------------------------------------- + // Diagnostic: dump the active (post-filter, post-deconvolution) peak list + // sorted by rank ascending. Read-only; uses the SAME peak/rank set the + // scorer consumes. Lets us compare Rust's kept-peak ranks against Java's. + // --------------------------------------------------------------------- + if cli.dump_peaks { + let dump_charge = charges_to_try.first().copied().unwrap_or(cli.charge_min); + let scored = ScoredSpectrum::new(spec, &scorer, dump_charge); + let active = scored.dump_active_peaks(); + println!( + "DUMP_PEAKS scan={} charge={} precursor_mz={} active_peaks={}", + cli.scan, + dump_charge, + spec.precursor_mz, + active.len() + ); + for (rank, mz, intensity) in &active { + println!("{}\t{:.5}\t{:.2}", rank, mz, intensity); + } + } + // --------------------------------------------------------------------- // Diagnostic: per-node GF ScoreDist dump for a specified peptide. // --------------------------------------------------------------------- diff --git a/crates/scoring/src/scoring/scored_spectrum.rs b/crates/scoring/src/scoring/scored_spectrum.rs index ca7ffc49..9d7ade33 100644 --- a/crates/scoring/src/scoring/scored_spectrum.rs +++ b/crates/scoring/src/scoring/scored_spectrum.rs @@ -467,6 +467,25 @@ impl<'a> ScoredSpectrum<'a> { self.parent_mass } + /// Diagnostic-only accessor: return the active peak list (post-deconvolution + /// when `apply_deconvolution` was applied, else the filtered original) as + /// `(rank, mz, intensity)` triples sorted by rank ascending (rank 1 = most + /// intense). Filtered-out peaks (rank == `u32::MAX`) are skipped. + /// + /// Read-only — does not affect scoring. Used by `msgf-trace --dump-peaks` + /// to compare Rust's kept-peak/rank assignment against Java's. + pub fn dump_active_peaks(&self) -> Vec<(u32, f64, f32)> { + let (peaks, ranks) = self.active_peaks_and_ranks(); + let mut out: Vec<(u32, f64, f32)> = peaks + .iter() + .zip(ranks.iter()) + .filter(|(_, &rank)| rank != u32::MAX) + .map(|(&(mz, intensity), &rank)| (rank, mz, intensity)) + .collect(); + out.sort_by(|a, b| a.0.cmp(&b.0)); + out + } + /// Return a cached `round(prefix_score + suffix_score)` split score when /// both nominal masses are in-bounds for this spectrum's FastScorer-style /// tables. Returns `None` when the cache is unavailable or either index is diff --git a/crates/search/src/mass_calibrator.rs b/crates/search/src/mass_calibrator.rs index a227cf73..ca55246f 100644 --- a/crates/search/src/mass_calibrator.rs +++ b/crates/search/src/mass_calibrator.rs @@ -150,7 +150,7 @@ pub fn learn_calibration_stats( originals, &prepared.candidates, MIN_DE_NOVO_SCORE, - constants::MIN_CONFIDENT_PSMS, + constants::RESIDUAL_CAP, ); if residuals.len() < constants::MIN_CONFIDENT_PSMS { diff --git a/crates/search/src/precursor_cal.rs b/crates/search/src/precursor_cal.rs index 755e7659..3b82d154 100644 --- a/crates/search/src/precursor_cal.rs +++ b/crates/search/src/precursor_cal.rs @@ -76,7 +76,18 @@ pub fn adjusted_observed_neutral_mass(raw_neutral: f64, shift_ppm: f64) -> f64 { pub mod constants { pub const SAMPLING_STRIDE: usize = 10; pub const MAX_SAMPLED: usize = 500; - pub const MIN_CONFIDENT_PSMS: usize = 200; + /// Upper bound on the number of best-SpecEValue residuals fed to the median + /// shift estimate. Java's `MassCalibrator` uses the top 200. Decoupled from + /// the firing minimum below so high-yield datasets (Astral/TMT) keep using a + /// full 200-residual estimate unchanged. + pub const RESIDUAL_CAP: usize = 200; + /// Minimum confident residuals required to TRUST the learned shift. Lowered + /// from 200 to 150: low-res-MS2 datasets (e.g. PXD001819 ion-trap CID) yield + /// fewer sub-1e-6 SpecEValue PSMs from the 500-spectrum sample (~193), which + /// previously fell just under 200 and skipped calibration entirely. 150 is + /// still a robust sample size for a median, and the cap above keeps the + /// high-yield datasets' estimate identical. + pub const MIN_CONFIDENT_PSMS: usize = 150; pub const MAX_SPEC_EVALUE: f64 = 1e-6; pub const MIN_SPECKEYS_FOR_PREPASS: usize = 10_000; diff --git a/docs/parity-analysis/notes/2026-05-28-phase2-peak-rank-parity.md b/docs/parity-analysis/notes/2026-05-28-phase2-peak-rank-parity.md new file mode 100644 index 00000000..985f7133 --- /dev/null +++ b/docs/parity-analysis/notes/2026-05-28-phase2-peak-rank-parity.md @@ -0,0 +1,78 @@ +# Phase 2 finding: peak-rank assignment is bit-identical to Java (H2 is NULL) + +**Date:** 2026-05-28 +**Branch:** `feat/id-rate-pxd001819-tmt` +**Dataset:** PXD001819, scan 41522 (a documented I5 label-flip scan) +**Tooling:** `msgf-trace --dump-peaks` (commit `12c1839c`) — dumps the post-filter, +post-deconvolution active peak list (rank, m/z, intensity). + +## Headline + +**Rust and Java assign the identical rank to every peak.** Comparing Rust's +dumped active peak list against Java's per-ion trace ranks, matched by *observed +peak m/z* (not theoretical-ion m/z), the rank offset is **+0 for all 465 matched +peaks**. No exceptions. + +This **debunks the I5 doc's central hypothesis** (`2026-05-26-score-psm-trace-findings.md`), +which attributed ~40% of the scoring divergence to H2 (peak-rank assignment) and +made it the primary fix target for this whole investigation. + +## What went wrong in the original I5 analysis + +The I5 `analyze.py` (and an even cruder ad-hoc script) aligned Rust↔Java ions by +`(ion_kind, round(theo_mz/1e-3))` — i.e. by *theoretical* ion m/z. The same +theoretical m/z can correspond to **different physical ions** in the two peptides +being compared (e.g. Rust `y/1` at theo_mz X vs Java `y/2+offset` at a +coincidentally-equal theo_mz X). Those spurious cross-matches produced the +RANK_DIFF=301 count and the apparent "+2 rank offset." + +When you instead match **actual observed peaks by their m/z** (what +`--dump-peaks` enables), the ranks are identical. The peak ranker +(`ScoredSpectrum::new` intensity-desc + m/z-asc, precursor-filtered peaks excluded) +already matches Java's `Spectrum.setRanksOfPeaks()` (intensity-desc via +`IntensityComparator`, precursor peaks zeroed-but-ranked-at-bottom) for every peak +that matters. + +Confirmed details: +- Java zeroes precursor peaks but keeps them in the ranked list (sorted to the + bottom at intensity 0); Rust removes them. This makes Java's total peak count + ~3 higher (max rank 489 vs Rust active 486) but does **not** shift any real + peak's rank, because the zeroed peaks sort below every real peak either way. +- Rust's precursor-filter tolerance (`pof.tolerance`) already equals Java's `mme` + (0.5 Da for CID_LowRes) for these offsets — a test swapping to `param.mme` left + the active-peak count unchanged at 486 (reverted as a no-op). + +## The label-flip still exists, but is not a rank bug + +On current master, scan 41522's Rust top-1 is a decoy (`VVYGNIYEIEIDRLFLTDQR`, +score 13, SpecE 4.66e-4) — Java picks a real peptide. But: +- The scores are tiny (top non-decoy is 11), so this is a low-quality spectrum + near the noise floor. +- Since peak ranks are identical, the divergence must be in **candidate + enumeration (H1)** — whether Rust enumerates/locates the Java-favored peptide in + this scan's mass window — and/or **log-prob table values (H3)**, not rank + assignment. + +## Implication for the +10% goal + +- The single lever the whole Phase 2 plan was built around (fix peak-rank + assignment) **does not exist** — Rust already matches Java bit-for-bit there. +- Combined with the external reviewer's "BSA 217/217 top-1 parity" and Phase 0's + finding that instrument/tolerance/calibration are already correct, the picture + is: **Rust's scoring is at or near parity with Java.** The residual gap + (PXD001819 −1.1% after Phase 1b's +53; TMT −5%) is small, concentrated on + low-quality / SpecE-tail spectra, and attributable to H1/H3 micro-differences + that the n=9+ audit says regress Percolator when "fixed" individually. +- **+10% over current Rust is not reachable via scoring-path fixes** — the scoring + path is essentially correct. The shippable result from this investigation is + Phase 1b (+53 PXD, zero-risk calibration nudge). + +## Recommended next directions (if ID-rate work continues) + +1. **Stop chasing rank/score parity** — it's already there. +2. If a large ID gain is still wanted, it likely requires *algorithmic* changes + (e.g. a different candidate-generation / scoring model), not parity fixes — a + research project, not a bench-gated tweak. +3. H1 (does Rust enumerate the Java-favored peptide in-window?) is the only + remaining parity question worth a *cheap* check, but the per-scan evidence + suggests these are low-score spectra that don't cross 1% FDR anyway. diff --git a/docs/superpowers/plans/2026-05-28-id-rate-pxd001819-tmt-plan.md b/docs/superpowers/plans/2026-05-28-id-rate-pxd001819-tmt-plan.md new file mode 100644 index 00000000..a405f633 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-id-rate-pxd001819-tmt-plan.md @@ -0,0 +1,385 @@ +# PXD001819 + TMT ID-rate Improvement Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. +> +> **This is a bench-gated investigation, not a deterministic build.** The ultimate +> pass/fail gate for each change is the Percolator 1%-FDR PSM count on the VM, not a +> unit test. Phase 0 is a hard prerequisite: its measurements decide whether Phase 1 +> has any yield. Phase 2's exact code edit is determined by a trace whose result is +> not known in advance — that task is written as an investigation protocol with a +> concrete fix location, candidate rules, and verification, NOT a pre-written diff. + +**Goal:** Lift PXD001819 (14,755) and TMT (9,605) PSM counts @1% FDR toward +10% over current Rust, without regressing wall >3% or regressing Astral (36,715). + +**Architecture:** One feature branch (`feat/id-rate-pxd001819-tmt`), one commit per change, each bench-gated on all three datasets via the existing VM harness, with in-place revert when the gate fails. Highest-leverage / lowest-risk first. + +**Tech Stack:** Rust (workspace crates: `scoring`, `search`, `input`, `output`, `msgf-rust`); bench VM `pride-linux-vm` via SSH control socket `/tmp/msgfplus-bench.sock`; Percolator 3.7.1 in Docker; Python stdlib for the I5 trace harness. + +--- + +## Conventions used throughout + +**Build (local):** `cargo build --release -p msgf-rust` (the committed `.cargo/config.toml` sets `target-cpu=sandybridge`). + +**Bit-identical regression gate (local):** +```bash +cargo test --release -p msgf-rust precursor_cal_off_pin_tsv_match_golden_after_sort +``` +Expected: `test result: ok. 1 passed`. (Phase 2 changes top-1 selection, so this gate will legitimately change — see Task 2a Step 6 for how to regenerate goldens. Phases 0/1-diagnostic/3 must keep it green.) + +**Workspace tests (local), CI skip list:** +```bash +cargo test --release --workspace -- \ + --skip charge_missing_spectrum_uses_per_charge_scored_spec \ + --skip spectrum_without_charge_tries_charge_range \ + --skip known_peptide_appears_in_top_n \ + --skip read_bsa_canno_text_format \ + --skip read_tryp_pig_bov_revcat_csarr_cnlcp \ + --skip tryp_pig_bov_revcat_full_set_loads \ + --skip match_spectra_output_invariant_across_thread_counts +``` + +**Clippy gate (local):** `cargo clippy --workspace --all-targets -- -D warnings` → clean. + +**VM bench (per change):** ship source, build, run 3 datasets cal=auto, run Percolator. The canonical baseline numbers to beat/hold: + +| Dataset | Rust @1% FDR (baseline) | Wall (baseline) | +|---|---:|---:| +| PXD001819 | 14,755 | ~0:54 | +| TMT | 9,605 | ~2:33 | +| Astral | 36,715 | ~6:28 | + +**Bench ship gate (per change):** gains PXD or TMT @1% FDR; regresses none of PXD/TMT/Astral beyond ~±0.3% (≈ ±45 PXD, ±29 TMT, ±110 Astral); wall within ~3% on all three. Otherwise `git revert` the commit in place. + +**VM socket precondition:** every VM step needs `ssh -S /tmp/msgfplus-bench.sock -O check root@pride-linux-vm` to succeed. If it fails, STOP and ask the human to run `ssh -M -S /tmp/msgfplus-bench.sock -fN root@pride-linux-vm`. + +--- + +## Phase 0 — Diagnostic (prerequisite; no code change) + +### Task 0: Measure what PXD001819 and TMT actually resolve to + +**Files:** none (measurement only). Produces `docs/parity-analysis/notes/2026-05-28-id-rate-phase0-diagnostic.md` (gitignored notes dir; local record). + +- [ ] **Step 1: Verify the VM socket is alive** + +Run: +```bash +ssh -S /tmp/msgfplus-bench.sock -O check root@pride-linux-vm +``` +Expected: `Master running (pid=NNNN)`. If not, STOP and request the human re-establish it. + +- [ ] **Step 2: Capture each Rust run's resolver + calibration stderr** + +The previous bench logs are at `/srv/data/msgf-bench/bench-v2024-results/{pxd001819,tmt,astral}-rust-auto.log`. Extract the resolver + calibration lines: +```bash +ssh -S /tmp/msgfplus-bench.sock root@pride-linux-vm \ + "for d in pxd001819 tmt astral; do echo \"== \$d ==\"; grep -iE 'Param resolver|instrument =|precursor.?cal|MassCalibrator|calibrat|confident|high.res|tolerance' /srv/data/msgf-bench/bench-v2024-results/\${d}-rust-auto.log; done" +``` +Expected: lines like `Param resolver: auto-detected dominant activation method = HCD (instrument = ...)`. Record the resolved instrument string per dataset. + +- [ ] **Step 3: Read the true instrument analyzer from each mzML** + +```bash +ssh -S /tmp/msgfplus-bench.sock root@pride-linux-vm \ + "for f in /srv/data/msgf-bench/data/UPS1_5000amol_R1.mzML /srv/data/msgf-bench/tmt-data/a05058.mzML /srv/data/msgf-bench/astral-data/LFQ_Astral_DDA_15min_50ng_Condition_A_REP1.mzML; do echo \"== \$f ==\"; grep -aoE 'MS:100(0079|0083|0084|0264|0484|0624|1906)|FTMS|ITMS|orbitrap|Q Exactive|Astral|ion trap' \$f | sort | uniq -c | head; done" +``` +Map cvParams: `MS:1000484` orbitrap, `MS:1000079` FTICR, `MS:1000264` ion trap, `MS:1000083` radial ejection ion trap. This tells the *true* MS2 analyzer. + +- [ ] **Step 4: Determine the tolerance branch actually taken** + +For each dataset, the scoring/feature tolerance is high-res (20 ppm) iff the resolved `InstrumentType::is_high_resolution()` is true. Find which `InstrumentType` enum value `is_high_resolution()` returns true for: +```bash +grep -n -A8 "fn is_high_resolution" crates/model/src/*.rs crates/scoring/src/*.rs 2>/dev/null +``` +Cross-reference the Step-2 resolved instrument against this. Record per dataset: `tolerance = 20ppm | 0.5Da`. + +- [ ] **Step 5: Write the diagnostic table** + +Create `docs/parity-analysis/notes/2026-05-28-id-rate-phase0-diagnostic.md` with the filled table: +```markdown +| Dataset | True MS2 analyzer | Resolved instrument | is_high_resolution | Feature/scoring tol | Calibration fired? | +|---|---|---|---|---|---| +| PXD001819 | ... | ... | ... | ... | ... | +| TMT | ... | ... | ... | ... | ... | +| Astral | (orbitrap/Astral) | (high-res) | true | 20ppm | yes (reference) | +``` + +- [ ] **Step 6: Decide Phase 1 scope (record in the same note)** + +Decision rules: +- If PXD or TMT shows `is_high_resolution = false` / `tol = 0.5Da` but the true analyzer is orbitrap/FT → **Phase 1a is live** (iter20-style win available). +- If calibration shows "skipped" on PXD or TMT → **Phase 1b is live**. +- If both are already high-res + calibrated → skip Phase 1, go to Phase 2. + +Write the decision explicitly. No commit (notes dir is gitignored); this task gates the rest. + +--- + +## Phase 1 — Config levers (only the sub-tasks Phase 0 marked "live") + +### Task 1a: Instrument-resolution / tolerance fix (CONDITIONAL on Phase 0 Step 6) + +**Files:** +- Modify: `crates/msgf-rust/src/bin/msgf-rust.rs` (instrument resolution at ~L585–631) — exact edit depends on the Phase-0 root cause (see Step 2). +- Possibly: `crates/model/src/.rs` (`is_high_resolution` mapping) or `crates/search/src/match_engine.rs::compute_psm_features` (tolerance branch). + +- [ ] **Step 1: Reproduce the mis-resolution locally** + +If PXD001819's mzML is available locally, run; otherwise reason from the Phase-0 evidence. Confirm the code path: does `detect_instrument_type` return the orbitrap type, and does that type's `is_high_resolution()` return true? If detection returns `None` and the code falls back to low-res (`crates/msgf-rust/src/bin/msgf-rust.rs:605` comment "None ⇒ resolver picks LowRes"), the bug is in detection, not the tolerance branch. + +- [ ] **Step 2: Make the targeted fix** + +Two likely shapes (pick per Step 1 evidence): +- **Detection gap:** `detect_instrument_type` fails to recognize the analyzer cvParam present in the mzML. Add the missing cvParam mapping in `crates/input/src/mzml.rs` (the `CV_ANALYZER_*` constants near L20–40 and the match at ~L722–735). +- **Tolerance-branch gap:** detection is correct but `compute_psm_features` / scoring tolerance still uses 0.5 Da. Route the resolved high-res `InstrumentType` into the `is_high_resolution()` check. + +Show the diff in the commit; do not guess it here. + +- [ ] **Step 3: Local gates** + +Run the bit-identical gate, workspace tests, clippy (commands in Conventions). The bit-identical golden is cal=off low-res, so a high-res-detection fix should NOT change it (different code path) — expected still green. If it changes, investigate before proceeding. + +- [ ] **Step 4: Commit** +```bash +git add -A +git commit -m "fix(resolve): detect high-res instrument so 20ppm tolerance engages on PXD/TMT" +``` + +- [ ] **Step 5: VM bench + gate** + +Ship, build, bench all 3 datasets (use the bench recipe in Task 2a Step 5). Apply the bench ship gate. If it regresses, `git revert HEAD` and record the result in the Phase-0 note. + +### Task 1b: Calibration engagement (CONDITIONAL on Phase 0 Step 6) + +**Files:** +- Modify: the MassCalibrator confident-PSM guard (find it): +```bash +grep -rn "confident\|min.*confident\|200\|calibrat" crates/scoring/src crates/search/src --include=*.rs | grep -i cal | head +``` + +- [ ] **Step 1: Locate the skip guard and log the actual count** + +Find where calibration decides to skip (the "<200 confident PSMs" guard from memory). Confirm via the Phase-0 stderr what count PXD/TMT hit. + +- [ ] **Step 2: Decide the change** + +If the guard is skipping with, say, 150 confident PSMs, evaluate lowering the threshold OR using a wider confident-PSM definition. This is a parameter change to a single constant — show it in the commit. + +- [ ] **Step 3: Local gates + commit** +```bash +cargo test --release -p msgf-rust precursor_cal_off_pin_tsv_match_golden_after_sort +git add -A && git commit -m "tune(cal): lower confident-PSM guard so calibration engages on PXD/TMT" +``` + +- [ ] **Step 4: VM bench + gate** (same recipe as Task 2a Step 5; revert if regresses). + +--- + +## Phase 2 — Label-flip gap (hot-path; investigation + bench gate) + +### Task 2a: H2 peak-rank assignment — match Java's `getPeakByMass` rule + +**Files:** +- Modify: `crates/scoring/src/scoring/scored_spectrum.rs` — `nearest_peak_rank_in` (~L897–918) and/or the rank-assignment sort in `ScoredSpectrum::new` (~L199–237). +- Test: `crates/scoring/src/scoring/scored_spectrum.rs` (inline `#[cfg(test)]` module). + +- [ ] **Step 1: Verify the VM socket; rebuild msgf-trace if needed** + +```bash +ssh -S /tmp/msgfplus-bench.sock -O check root@pride-linux-vm +``` +The I5 artifacts are committed at `docs/parity-analysis/notes/score-psm-trace-artifacts/`. The 5 traced scans: 41522, 34685, 23272, 23082, 16629. + +- [ ] **Step 2: Identify one concrete RANK_DIFF ion** + +From the committed artifacts, find the first RANK_DIFF ion on scan 41522: +```bash +cd docs/parity-analysis/notes/score-psm-trace-artifacts +python3 analyze.py 2>/dev/null | grep -i rank_diff | head +# Then inspect the specific ion in rust-trace-scan-41522.json: +python3 -c "import json; d=json.load(open('rust-trace-scan-41522.json')); [print(p['peptide'], [i for i in p['ions'] if 'rank' in i][:3]) for p in d]" 2>/dev/null | head +``` +Record: `(theo_mz, rust_rank, java_rank)` for the first divergent ion. Decompress the Java side if needed: `gunzip -k java-trace-scan-41522.log.gz`. + +- [ ] **Step 3: Read Java's `getPeakByMass` rule** + +The Java reference is on the bench VM clone `/srv/data/msgf-bench/java-legacy-trace/`. Find the peak-selection rule: +```bash +ssh -S /tmp/msgfplus-bench.sock root@pride-linux-vm \ + "grep -rn -A20 'getPeakByMass' /srv/data/msgf-bench/java-legacy-trace/src/main/java/ | head -40" +``` +Determine Java's tie-break when multiple peaks fall in tolerance: first-in-list? closest-m/z? highest-intensity? Rust currently uses **highest-intensity, strict `>` (first wins on tie)**. Record the exact divergence. + +- [ ] **Step 4: Write a failing unit test that encodes Java's rule** + +In `scored_spectrum.rs` tests, construct a tiny spectrum with two peaks inside one tolerance window where Java's rule and Rust's current rule disagree (use the real (theo_mz, ranks) from Step 2 if it reproduces, else a synthetic case matching the rule found in Step 3). Assert `nearest_peak_rank_in` returns Java's choice. +```rust +#[test] +fn nearest_peak_rank_matches_java_getpeakbymass_tiebreak() { + // two peaks within tol of target; Java picks + // assert the rank returned equals Java's pick +} +``` +Run: `cargo test --release -p scoring nearest_peak_rank_matches_java -- --nocapture` → Expected: FAIL. + +- [ ] **Step 5: Implement the rule change; verify the unit test passes** + +Change the selection in `nearest_peak_rank_in` (and the rank-assignment sort if the divergence is there) to match Java. Run the unit test → Expected: PASS. Then run the full `scoring` crate tests. + +- [ ] **Step 6: Regenerate the bit-identical golden (top-1 legitimately changes)** + +This change alters top-1 selection, so `precursor_cal_off.pin/.tsv` goldens shift. Regenerate them per the repo's golden-update procedure: +```bash +grep -rn "precursor_cal_off_pin_tsv_match_golden" crates/msgf-rust/tests/ | head +# Follow the test's documented regeneration path (typically an env var or a +# regen helper); inspect the test to confirm before regenerating. +``` +Commit the regenerated goldens together with the code change so the gate stays meaningful. + +- [ ] **Step 7: Commit** +```bash +git add -A +git commit -m "fix(scoring): match Java getPeakByMass tie-break in peak-rank assignment (H2)" +``` + +- [ ] **Step 8: Re-run the I5 trace harness on the VM; confirm RANK_DIFF drops** + +Rebuild msgf-trace on the VM, re-run the 5-scan trace, re-run `analyze.py`. Expected: RANK_DIFF count drops from 301; LOGPROB_DIFF drops proportionally. If RANK_DIFF does NOT drop, the rule change was wrong — revisit Step 3. + +- [ ] **Step 9: VM bench + gate (the decisive test)** + +Bench recipe: +```bash +# ship source (rsync/scp changed crates), then on VM: +ssh -S /tmp/msgfplus-bench.sock root@pride-linux-vm \ + "cd /srv/data/msgf-bench/iter5-build && PATH=\$HOME/.cargo/bin:\$PATH cargo build --release -p msgf-rust" +# run 3 datasets cal=auto (reuse run_bench_*.sh pattern), then percolator: +ssh -S /tmp/msgfplus-bench.sock root@pride-linux-vm \ + "for l in pxd001819-rust-auto astral-rust-auto tmt-rust-auto; do bash /srv/data/msgf-bench/run_percolator_docker.sh /srv/data/msgf-bench/bench--results/\$l.pin /srv/data/msgf-bench/bench--percolator \$l; done" +``` +Apply the bench ship gate. **If Percolator regresses on any dataset beyond noise, `git revert HEAD` (and the goldens commit) — this is the n=9+ audit risk materializing.** Record the result. + +### Task 2b: H3 log-prob indexing (CONDITIONAL — only if 2a passed its gate) + +**Files:** +- Modify: `crates/scoring/src/scoring/rank_scorer.rs` (table indexing / clamping). +- Test: inline `#[cfg(test)]` in `rank_scorer.rs`. + +- [ ] **Step 1: Re-run analyze.py; isolate a pure-H3 ion** + +After 2a, find an ion with LOGPROB_DIFF but NO RANK_DIFF (same rank, different value) — there were 307 such cases pre-fix. Record `(partition, ion_type, rank, rust_logprob, java_logprob)`. + +- [ ] **Step 2: Compare Rust's table lookup to Java's for that ion** + +Trace `rank_scorer.rs` indexing (`idx = min(rank, max_rank).max(1) - 1`; missing uses `max_rank` slot) against Java's `getNodeScore` table lookup for the same (partition, ion, rank). Identify the off-by-one / clamping / table-content difference. + +- [ ] **Step 3: Failing unit test encoding Java's lookup** + +Construct a `RankScorer` from a known partition table and assert the log-prob at a specific rank matches Java's value for the recorded ion. Run → Expected FAIL. + +- [ ] **Step 4: Fix indexing; unit test passes; regenerate goldens; commit** +```bash +cargo test --release -p scoring +git add -A && git commit -m "fix(scoring): align per-rank log-prob table indexing with Java (H3)" +``` + +- [ ] **Step 5: VM bench + gate** (same recipe as 2a Step 9; revert if regresses). + +--- + +## Phase 3 — Additive PIN features (safety net; only if Phases 1–2 < +10%) + +### Task 3a: Add `MeanMatchedRank` PIN column + +**Files:** +- Modify: `crates/search/src/psm.rs` (`PsmFeatures` — add field). +- Modify: `crates/search/src/match_engine.rs::compute_psm_features` (compute it). +- Modify: `crates/output/src/pin.rs` (`write_header` + `write_psm_row` — emit it, inserted before `Peptide` next to `EdgeScore`). +- Test: `crates/output/src/pin.rs` tests + `crates/output/tests/output_pin_schema_parity.rs`. + +- [ ] **Step 1: Failing test for the new column in the header** + +In `crates/output/tests/output_pin_schema_parity.rs`, add an assertion that the header contains `MeanMatchedRank` at the expected position. Run → Expected FAIL. + +- [ ] **Step 2: Add the field to `PsmFeatures`** +```rust +// crates/search/src/psm.rs, in PsmFeatures +pub mean_matched_rank: f32, +``` + +- [ ] **Step 3: Compute it in `compute_psm_features`** + +After the matched-ion loop, average `nearest_peak_rank` over matched b/y ions (0.0 if none matched): +```rust +// crates/search/src/match_engine.rs, in compute_psm_features +let mean_matched_rank = if matched_rank_count > 0 { + matched_rank_sum as f32 / matched_rank_count as f32 +} else { 0.0 }; +// ...set features.mean_matched_rank = mean_matched_rank; +``` +(Accumulate `matched_rank_sum`/`matched_rank_count` inside the existing matched-ion loop where `nearest_peak_rank` is already called.) + +- [ ] **Step 4: Emit the column** + +In `crates/output/src/pin.rs::write_header`, insert `MeanMatchedRank` immediately before `EdgeScore`; in `write_psm_row`, write `features.mean_matched_rank` in the same position. Keep `EdgeScore` and `Peptide`/`Proteins` last. + +- [ ] **Step 5: Update schema-parity test + run** +```bash +cargo test --release -p output +cargo test --release --workspace -- +``` +Expected: PASS (column count +1 everywhere it's asserted). + +- [ ] **Step 6: Commit** +```bash +git add -A && git commit -m "feat(pin): add additive MeanMatchedRank feature column" +``` + +- [ ] **Step 7: VM bench + gate** + +Additive columns never change top-1, so PXD/TMT/Astral PSM *search* output is byte-identical pre-Percolator except the new column; the only delta is whether Percolator extracts signal from it. Bench all 3; keep if it gains, revert if it regresses (it should at worst be flat). + +### Task 3b: Add `ScoreFractionTop1Split` PIN column (only if 3a flat and still < target) + +**Files:** +- Modify: `crates/scoring/src/scoring/psm_score.rs::score_psm` — return the max single-split contribution alongside the sum (change return type to `(f32, f32)` or add a sibling fn to avoid disturbing callers). +- Modify: `crates/search/src/psm.rs`, `match_engine.rs`, `crates/output/src/pin.rs` as in 3a. + +- [ ] **Step 1: Failing header test** (as 3a Step 1, for `ScoreFractionTop1Split`). +- [ ] **Step 2: Track max split in `score_psm`** + +Inside the split loop that accumulates `total += contribution`, also track `max_contrib = max_contrib.max(contribution)`. Expose `max_contrib / total` (guard total==0 → 0.0). To avoid perturbing the hot `score_psm` signature used in ranking, add a separate `score_psm_with_split_max(...) -> (f32, f32)` used only in `compute_psm_features`, OR compute the fraction in `compute_psm_features` by re-deriving splits (prefer the former to avoid double work). + +- [ ] **Step 3–6:** wire field → compute → emit → schema test → bench gate, exactly as 3a Steps 2–7. + +--- + +## Final: cumulative bench + branch close-out + +### Task 4: Cumulative bench, PR, ship/revert reconciliation + +- [ ] **Step 1: Full 3-dataset bench at branch HEAD** + +Confirm the cumulative PXD/TMT/Astral @1% FDR and walls vs baseline. Build the result table. + +- [ ] **Step 2: Reconcile against the stretch target** + +Record cumulative deltas. If +10% reached, note it. If not, the shipped subset is whatever net-gained under the hard gate (the +10% is a direction, not a revert-all gate per the spec). + +- [ ] **Step 3: Open the PR** +```bash +git push -u origin feat/id-rate-pxd001819-tmt +gh pr create --base dev --title "perf(id-rate): close PXD001819 + TMT label-flip gap vs Java" --body "" +``` + +- [ ] **Step 4: Watch CI to green** (Lint + 3 OS tests + CodeRabbit), fix any failures. + +--- + +## Self-review notes + +- **Spec coverage:** Phase 0 → Task 0; Phase 1a → Task 1a; Phase 1b → Task 1b; Phase 2 H2 → Task 2a; Phase 2 H3 → Task 2b; Phase 3 → Tasks 3a/3b; success-criteria reconciliation → Task 4. All spec sections covered. +- **Conditionality is intentional:** Tasks 1a/1b/2b/3a/3b are explicitly gated on prior-task outcomes — this matches the spec's "outcome of each phase decides the next." An executor must honor the CONDITIONAL markers, not run every task blindly. +- **Phase 2 code is deliberately not pre-written:** the exact tie-break/indexing edit is unknowable until Steps 2–3 of each task read the real trace + Java source. The tasks pin the file:line, the candidate rules, the unit-test shape, and the bench gate — which is the maximum honest specificity here. +- **Golden regeneration** is called out explicitly in 2a/2b because those tasks change top-1 selection and will move the committed PIN/TSV goldens. diff --git a/docs/superpowers/specs/2026-05-28-id-rate-pxd001819-tmt-design.md b/docs/superpowers/specs/2026-05-28-id-rate-pxd001819-tmt-design.md new file mode 100644 index 00000000..4b6cc4d3 --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-id-rate-pxd001819-tmt-design.md @@ -0,0 +1,179 @@ +# ID-rate improvement for PXD001819 + TMT — design + +**Date:** 2026-05-28 +**Branch:** `feat/id-rate-pxd001819-tmt` +**Type:** Phased, bench-gated investigation (outcome of each phase decides the next) + +## Goal + +Increase peptide-spectrum-match (PSM) counts at 1% FDR (Percolator) for the two +datasets where msgf-rust currently trails or barely matches upstream Java +MS-GF+ v2024.03.26: + +| Dataset | Current Rust @1% | Upstream Java @1% | Stretch target (+10% over Rust) | +|---|---:|---:|---:| +| PXD001819 (UPS1 yeast tryptic) | 14,755 | 14,974 | ~16,230 | +| TMT (a05058, PXD007683) | 9,605 | 10,115 | ~10,565 | + +The +10%-over-Rust figure means *beating* upstream Java by ~8% (PXD) / ~4% (TMT) +— the same kind of edge msgf-rust already holds on Astral DDA (+9.8%: 36,715 vs +33,425). It is a **stretch target**, not a hard gate (see "Success criteria"). + +Constraints: +- **No wall-time regression beyond noise (~3%)** on any of the three datasets. +- **No PSM regression on Astral** (currently 36,715 @1% FDR — already beating Java). + +## Background + +The I5 trace investigation (`docs/parity-analysis/notes/2026-05-26-score-psm-trace-findings.md`, +merged via PR #37) localized the Rust↔Java per-PSM scoring divergence on +PXD001819. The gap is dominated by **label flips**: Rust ranks a different +(usually wrong) top-1 peptide than Java because its scoring over-scores weaker +peptides by +5..+20 RawScore and under-scores Java's favored peptide by ~13. +Three roughly-equal contributors, all in the hot scoring path: + +- **H1 — ion-type enumeration** (~10% of divergent ions): Rust scores some ions Java doesn't. +- **H2 — peak-rank assignment** (~40%): when several peaks fall inside the + fragment-tolerance window, the selected peak / assigned rank differs from Java's. +- **H3 — per-rank log-probability lookups** (~40%): a different log-prob value is + returned even when Rust and Java agree on the rank. + +The project's empirical audit pattern (n≈9+, recorded in workspace memory): +- **Additive** changes (new PIN columns) are *safe* but historically *flat* — + Percolator already extracts most of that signal via correlated columns. +- **Modifying-existing-distribution** changes usually *regress* Percolator @1% FDR + even when individually "more correct" — UNLESS they move Rust's top-1 selection + *toward Java's*. The three biggest historical gains were all top-1-restoring + changes: iter20 high-res tolerance fix (+4,650 Astral), iter33 edge-in-ranking + (+3,705), iter29 main-ion fix (+379). + +This design exploits that pattern: pursue the cheap top-1-restoring config lever +first, then the hot-path label-flip fix (also top-1-restoring), keeping additive +features as a zero-risk safety net. + +## Relevant code (from the 2026-05-28 code map) + +- Peak rank assignment (H2): `crates/scoring/src/scoring/scored_spectrum.rs` + - `ScoredSpectrum::new` (~L168–357): assigns ranks intensity-desc then m/z-asc; + precursor-filtered peaks excluded before ranking. + - `nearest_peak_rank_in` (~L897–918): binary search + linear scan in tolerance + window, selects **highest intensity** (strict `>`, so first peak wins on ties). + - `directional_node_score_inner` (~L715–750): tolerance is `mme.as_da(theo_mz)`. +- Log-prob tables (H3): `crates/scoring/src/scoring/rank_scorer.rs` + - `node_score` / hot-path indexing: `idx = min(rank, max_rank).max(1) - 1`; + missing ion uses the `max_rank` sentinel slot. + - `error_score`, `ion_existence_score`. +- Ion-type enumeration (H1): `crates/scoring/src/param_model.rs` + - `build_partition_ion_types_cache`, `partition_for` / `find_partition`. +- RawScore assembly: `crates/scoring/src/scoring/psm_score.rs::score_psm`; + `pin_score` (= node + cleavage) vs `rank_score` (= node + cleavage + edge) split + in `crates/search/src/match_engine.rs` (~L408–448). +- Instrument/tolerance resolution: `crates/msgf-rust/src/bin/msgf-rust.rs` (~L585–631); + `detect_instrument_type` in `crates/input/src/mzml.rs`; + `InstrumentType::is_high_resolution()` gates the 20-ppm vs 0.5-Da feature tolerance + in `compute_psm_features` (`crates/search/src/match_engine.rs`). +- PIN columns: `crates/output/src/pin.rs::write_header` / `write_psm_row`. +- Feature struct: `crates/search/src/psm.rs::PsmFeatures`. + +## Success criteria + +- **Hard gate (per change):** ships only if it gains PXD or TMT @1% FDR, regresses + *none* of PXD/TMT/Astral beyond Percolator noise (~±0.3%), and keeps wall within + ~3% on all three. Otherwise revert. +- **Stretch target:** +10% over current Rust on PXD and TMT. Treated as a + *direction we bench toward*, NOT a revert-everything gate — we ship every change + that net-gains under the hard gate, and report the cumulative result even if it + lands below +10%. +- **Speed:** Phase-2 hot-path changes must show no wall regression beyond noise; + revert if they do, even if PSMs gain. + +## Bench protocol (every phase) + +Reuse the established VM harness (`/srv/data/msgf-bench/`): +- Build Rust release with `target-cpu=sandybridge` (the committed `.cargo/config.toml`). +- Run all 3 datasets, cal=auto, `--top-n 1 --threads 8`, per-dataset + tolerance/instrument/protocol matching the README bench. +- Percolator 3.7.1 via `run_percolator_docker.sh` (`--seed 42 --only-psms`), + parse 1%-FDR target count. +- Compare against the locked baseline (PXD 14,755 / TMT 9,605 / Astral 36,715). +- One commit per change; revert in place if the hard gate fails. Keep reverts on + the branch as record (matches the project's iteration-shipping model). + +## Phases + +### Phase 0 — Diagnostic (measurement only; no code change) + +Determine, for each of PXD001819 and TMT, what the *current* run actually resolves +to. Output a table: + +| Dataset | Resolved instrument | Feature tolerance | Scoring tolerance | Calibration fired? | Dominant activation | +|---|---|---|---|---|---| + +Sources: +- The Rust run's stderr ("Param resolver: auto-detected ...", "instrument = ..."). +- `InstrumentType::is_high_resolution()` for the resolved instrument → 20 ppm vs 0.5 Da. +- The MassCalibrator log line (fired vs skipped, and the ` cvParam (FTMS/Orbitrap/ITMS) to know the *true* instrument. + +**Decision:** if PXD/TMT are high-res but resolving to the 0.5-Da low-res tolerance, +Phase 1a has the iter20 win available. If they're already on the high-res path, +Phase 1a yields ~0 and we proceed to Phase 1b / Phase 2. + +### Phase 1 — Config levers (low speed risk) + +**1a — Instrument / fragment tolerance.** If Phase 0 shows a low-res/high-res +mismatch on the *tolerance* path, fix instrument resolution so the 20-ppm +high-res branch engages in both `compute_psm_features` and the scoring tolerance. +Likely a fix to how `detect_instrument_type` feeds `InstrumentType::is_high_resolution()`, +or a default change. Bench-gate. + +**1b — Calibration.** If Phase 0 shows `--precursor-cal auto` skipping on PXD/TMT +(the historical guard skips when <200 confident PSMs), investigate the guard +threshold and whether calibration would add PSMs (it did on Astral). Bench-gate. + +### Phase 2 — Label-flip gap (hot-path; bench-gated, revert-ready) + +**2a — H2 peak-rank assignment.** Using the I5 artifacts (scan 41522, +`R.DPANLPWASLNIDIAIDSTGVFK.E`), find the first RANK_DIFF ion (theo_mz, rust_rank, +java_rank). Trace both peak-selection paths: +- Rust `nearest_peak_rank_in`: intensity-max in window, strict `>`. +- Java `getPeakByMass`: identify its tie-break (first peak / closest m/z / index order). +Make Rust match Java's rule. Re-run the I5 trace harness; confirm RANK_DIFF count +drops and (by the H2→H3 coupling) LOGPROB_DIFF drops proportionally. Bench-gate on +PXD + TMT + Astral. + +**2b — H3 log-prob indexing.** Only if 2a lands and the bench gate passes. Close +the residual same-rank/different-value cases (307 of 608 in the I5 data). Likely a +table-indexing or clamping difference in `rank_scorer.rs`. Bench-gate. + +H1 (ion enumeration) is deferred unless 2a/2b leave a clear ion-enumeration residual +in the re-run trace. + +### Phase 3 — Additive PIN features (safety net) + +Only if Phases 1–2 fall short of the stretch target. Add new PIN columns (no change +to existing column values): +- `MeanMatchedRank` — average `nearest_peak_rank` over matched b/y ions. +- `ScoreFractionTop1Split` — max single-split contribution / RawScore (requires + keeping the max split alongside the sum in `score_psm`). +- `ln(num_distinct)` — already computed for E-value; expose as a column. +Bench-gate; expected flat-to-small. + +## Risks & mitigations + +- **Phase 2 regresses Percolator** (n=9+ audit): bench-gate per dataset, revert in + place. The I5 trace re-run is the leading indicator (RANK_DIFF should drop) before + trusting the Percolator delta. +- **Phase 2 slows the hot path:** measure wall on all 3 datasets; revert if >3%. +- **+10% unreachable via scoring alone:** acknowledged up front; success criteria + treat +10% as a direction, not a revert-all gate. Worst case we close the + gap-to-Java and modestly beat it, and ship the net-positive subset. +- **Astral regression:** every change is checked against Astral, which already + wins — a change that helps PXD/TMT but breaks Astral is reverted. + +## Out of scope + +- Astral (already beats Java; only a regression guard here). +- Algorithmic rewrites of the GF DP or candidate enumeration. +- New scoring models / `.param` retraining. +- The README bench-table PR (#39, separate).