Project-Navi · Maurice Witten (blocksifrdev) · Jun 20, 2026 · Jun 21, 2026 · Jun 21, 2026 · github-actions
@@ -0,0 +1,49 @@
+## Summary
+
+Adds an optional CAIF-style Index Authority Receipt for ordvec benchmark evidence.
+
+The goal is to make ordvec's index-first retrieval evidence machine-readable: quality delta, bytes/vector, latency regime, benchmark scope, limitations, fallback conditions, and a deterministic receipt hash.
+
+## Why
+
+ordvec already has a strong index-first compute story: compressed ordinal/sign retrieval can preserve retrieval quality under stated benchmark scopes while reducing storage and latency.
+
+This PR adds a small evidence packet and verifier so downstream systems can answer:
+
+> Does the benchmark evidence support using this compressed/index-first retrieval path before dense compute within the stated workload scope?
+
+## What this includes
+
+- `docs/INDEX_AUTHORITY_RECEIPTS.md`
+- `examples/caif/trec-covid-sign-rq2.index-authority.json`
+- `tools/verify_index_authority.py`
+
+## What this does not do
+
+- Does not change Rust code
+- Does not change `Cargo.toml`
+- Does not add runtime dependencies
+- Does not add CI requirements
+- Does not claim new benchmark results
+- Does not add signing, key management, or deployment trust policy
+
+## Verification
+
+    python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json
+
+Expected output includes:
+
+    decision: ALLOW_INDEX_FIRST
+    quality_within_bootstrap_noise: true
+    storage_reduction: 10.6667x
+    single_query_speedup: 105.6604x
+
+## Scope
+
+The example uses existing public README benchmark values and preserves the stated limitations around dataset, encoder, corpus size, batch/threading regime, HNSW comparison, and larger-corpus claims.
+
+## Framing
+
+Benchmarks should not only report performance.
+
+They should authorize compute paths within a defined evidence envelope.
@@ -49,3 +49,6 @@ venv/
 /.cache/ordvec-beir/
 /results/beir/*
 !/results/beir/.gitkeep
+
+# CortexTrace local runtime artifacts
+.cortextrace/
@@ -0,0 +1,48 @@
+# Index Authority Receipts for ordvec
+
+Index Authority Receipts are CAIF-style evidence packets for ordvec benchmark results.
+
+They make index-first retrieval evidence machine-readable.
+
+Instead of only asking whether a retrieval mode is faster, a receipt asks whether the benchmark evidence supports using a compressed/index-first retrieval path within a stated workload scope.
+
+## IFC
+
+Index-First Compute means a cheaper index representation is evaluated before more expensive dense compute.
+
+For ordvec, IFC can include a RankQuant compressed scan, Bitmap candidate generation, SignBitmap candidate generation, or a SignBitmap-to-RankQuant rerank.
+
+## CAIF
+
+Compute Authority Index Format describes whether a compute path is justified under a stated evidence envelope.
+
+A receipt records baseline mode, candidate mode, quality delta, storage reduction, latency profile, scope, limitations, fallback conditions, and a deterministic receipt hash.
+
+## Verify
+
+Run:
+
+    python3 tools/verify_index_authority.py examples/caif/trec-covid-sign-rq2.index-authority.json
+
+Expected output:
+
+    decision: ALLOW_INDEX_FIRST
+    mode: sign_to_rq2
+    baseline: flat_exact
+    quality_within_bootstrap_noise: true
+    storage_reduction: 10.6667x
+    single_query_speedup: 105.6604x
+
+## Non-goals
+
+This does not change Rust code, `Cargo.toml`, CI, or runtime behavior, and it does not add signing, key management, or deployment trust policy.
+
+It does not create new benchmark claims.
+
+It preserves the stated benchmark scope and limitations.
+
+## Principle
+
+Benchmarks should not only report performance.
+
+They should authorize compute paths within a defined evidence envelope.
@@ -0,0 +1,82 @@
+{
+  "schema": "ordvec.index_authority.v0.1",
+  "subject": {
+    "project": "ordvec",
+    "mode": "sign_to_rq2",
+    "version": "0.5.0"
+  },
+  "baseline": {
+    "mode": "flat_exact",
+    "bytes_per_vector": 4096
+  },
+  "ifc": {
+    "enabled": true,
+    "compute_path": [
+      "sign_bitmap_candidate_generation",
+      "rankquant_b2_rerank"
+    ],
+    "training_required": false,
+    "fit_required": false,
+    "graph_required": false,
+    "float_corpus_required_for_reported_path": false
+  },
+  "evidence": {
+    "dataset": "trec-covid",
+    "dataset_family": "BEIR",
+    "encoder": "Harrier-Q8 1024-d",
+    "corpus_size": 171332,
+    "metric": "nDCG@10",
+    "baseline_score": 0.7574,
+    "candidate_score": 0.7638,
+    "delta_vs_baseline": 0.0064,
+    "within_bootstrap_noise": true,
+    "evidence_source": "repository README benchmark table"
+  },
+  "economics": {
+    "candidate_bytes_per_vector": 384,
+    "storage_reduction_x": 10.6667,
+    "single_query_latency_ms": {
+      "baseline": 56.0,
+      "candidate": 0.53
+    },
+    "single_query_speedup_x": 105.6604
+  },
+  "decision": {
+    "recommended": "ALLOW_INDEX_FIRST",
+    "policy": {
+      "min_storage_reduction_x": 8.0,
+      "min_single_query_speedup_x": 10.0,
+      "require_quality_within_bootstrap_noise": true,
+      "require_scope": true,
+      "require_limitations": true
+    },
+    "fallback": [
+      "Use dense flat or ANN comparison when dataset, encoder, scale, or serving regime falls outside the stated evidence scope.",
+      "Require HNSW comparison for highly parallel threaded serving claims.",
+      "Require checked-in artifacts before extending the claim to larger corpora or alternate encoders."
+    ]
+  },
+  "scope": {
+    "claim_status": "public_repository_evidence",
+    "applies_to": [
+      "BEIR trec-covid",
+      "Harrier-Q8 1024-d embeddings",
+      "171332 document public benchmark run",
+      "single-query latency comparison against exact flat"
+    ],
+    "does_not_claim": [
+      "million-scale HNSW crossover",
+      "GPU bandwidth claims",
+      "alternate-encoder generalization",
+      "all serving regimes",
+      "dominance over HNSW in highly parallel threaded throughput"
+    ]
+  },
+  "limitations": [
+    "The compressed scan remains O(n), with a lower constant than dense flat.",
+    "HNSW wins the committed highly parallel threaded view.",
+    "The claim is scoped to the stated dataset, encoder, corpus size, and benchmark artifact.",
+    "Larger-corpus and alternate-encoder claims require checked-in run artifacts.",
+    "This receipt does not sign artifacts or manage deployment trust policy."
+  ]
+}
@@ -0,0 +1,9 @@
+{
+  "schema": "ordvec.index_authority.verifier_policy.v0.1",
+  "min_storage_reduction_x": 4.0,
+  "min_single_query_speedup_x": 1.25,
+  "max_quality_delta_loss": 0.02,
+  "require_scope": true,
+  "require_limitations": true,
+  "require_hnsw_comparison_for_parallel_claims": true
+}
@@ -0,0 +1,115 @@
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+VERIFY = ROOT / "tools" / "verify_index_authority.py"
+RECEIPT = ROOT / "examples" / "caif" / "trec-covid-sign-rq2.index-authority.json"
+POLICY = ROOT / "policies" / "index-authority.default-policy.json"
+
+def run_verify(path):
+    return subprocess.run(
+        [sys.executable, str(VERIFY), str(path), "--policy", str(POLICY)],
+        cwd=ROOT,
+        text=True,
+        capture_output=True,
+    )
+
+def test_valid_receipt_passes():
+    result = run_verify(RECEIPT)
+    assert result.returncode == 0, result.stderr + result.stdout
+
+def test_missing_required_field_rejected(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data.pop("evidence")
+    bad = tmp_path / "missing-evidence.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode != 0
+
+def test_metric_tampering_rejected(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["economics"]["storage_reduction_x"] = 999
+    bad = tmp_path / "tampered.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode != 0
+
+def test_decision_mismatch_exit_code_3(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["decision"]["recommended"] = "DENY_UNSCOPED_CLAIM"
+    bad = tmp_path / "decision-mismatch.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode == 3
+
+def test_ifc_disabled_rejected(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["ifc"]["enabled"] = False
+    bad = tmp_path / "ifc-disabled.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode != 0
+    assert "ifc.enabled must be true" in result.stderr
+
+def test_ifc_empty_compute_path_rejected(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["ifc"]["compute_path"] = ""
+    bad = tmp_path / "ifc-empty-path.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode != 0
+    assert "ifc.compute_path" in result.stderr
+
+def test_nan_metrics_rejected(tmp_path):
+    bad = tmp_path / "nan.json"
+    text = RECEIPT.read_text().replace('"storage_reduction_x":', '"storage_reduction_x": NaN, "old_storage_reduction_x":', 1)
+    bad.write_text(text)
+    result = run_verify(bad)
+    assert result.returncode != 0
+    assert "non-finite" in result.stderr
+
+def test_blank_scope_entries_rejected(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["scope"]["applies_to"] = [""]
+    data["scope"]["does_not_claim"] = ["  "]
+    data["limitations"] = [""]
+    bad = tmp_path / "blank-scope.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode != 0
+
+def test_significant_quality_improvement_allowed(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["evidence"]["candidate_score"] = data["evidence"]["baseline_score"] + 0.05
+    data["evidence"]["delta_vs_baseline"] = 0.05
+    data["evidence"]["within_bootstrap_noise"] = False
+    data["decision"]["recommended"] = "ALLOW_INDEX_FIRST"
+    bad = tmp_path / "quality-improvement.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode == 0, result.stderr + result.stdout
+
+def test_parallel_claim_requires_concrete_hnsw_evidence(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["scope"]["applies_to"] = ["highly parallel threaded serving"]
+    data["evidence"]["compared_against_hnsw"] = True
+    data["evidence"]["hnsw_comparison"] = {}
+    data["decision"]["recommended"] = "ALLOW_INDEX_FIRST"
+    bad = tmp_path / "empty-hnsw.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode == 3
+    assert "REQUIRE_HNSW_COMPARISON" in result.stderr + result.stdout
+
+def test_single_query_production_does_not_require_hnsw(tmp_path):
+    data = json.loads(RECEIPT.read_text())
+    data["scope"]["applies_to"] = ["single-query production serving"]
+    data["evidence"].pop("hnsw_comparison", None)
+    data["evidence"].pop("compared_against_hnsw", None)
+    data["decision"]["recommended"] = "ALLOW_INDEX_FIRST"
+    bad = tmp_path / "single-query-prod.json"
+    bad.write_text(json.dumps(data))
+    result = run_verify(bad)
+    assert result.returncode == 0, result.stderr + result.stdout