From 0e3876d5e423988ad4752a4d4bfd5cb486bdc3b2 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Sat, 27 Jun 2026 20:21:05 +0200
Subject: [PATCH 1/2] buzhash64: add FastCDC-style normalized chunking

Normalized chunking switches between a stricter and a looser cut mask
around the target chunk size. This greatly tightens the chunk-size
distribution (coefficient of variation ~0.9 -> ~0.3 in tests) and removes
the dedup-hostile max-size-clamped chunks, with unchanged deduplication.

chunker-params for buzhash64 gains a required 6th field, nc_level:

  buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level

Use nc_level=2 for the new default, nc_level=0 to disable (then behavior
is byte-identical to the previous single-mask chunker).

buzhash (32bit) is untouched and stays bit-compatible with borg 1.x.

The mask transition point (normal_size) defaults to a principled formula
(target minus the expected loose-phase tail) so the mean stays near the
target; it can be tuned via the normal_size constructor arg.

scripts/chunker_bench.py: evidence harness used to measure chunk-size
distribution, dedup ratio, throughput and shift-resilience.

Measurements (before = nc_level 0, after = nc_level 2; both at the default
params buzhash64,19,23,21,4095; measured with scripts/chunker_bench.py):

5 GiB of incompressible data (~2000-2700 chunks, statistically stable):

  before:  CV 0.739,  49 max-size-clamped (8 MiB) chunks,   953 MB/s
  after:   CV 0.311,   0 max-size-clamped chunks,          1024 MB/s

Re-backup of a 2.5 GiB file after a few scattered single-byte edits
(deduplication ratio; 0.5 = v2 fully deduplicated against v1, lower is
better):

   64 edits:  before 0.5424  ->  after 0.5235
  320 edits:  before 0.6791  ->  after 0.6142

Normalized chunking deduplicates better after edits: removing the
max-size-clamped chunks means a single-byte change invalidates much less
data (about 36% less dedup overhead at 320 edits). Throughput was also
consistently higher with nc_level=2 at this scale.

Also: fix bug when computing the mask, one needs to use 1ULL instead of
1, so the shifting computation is done in a uint64, not in a 32bit int.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 docs/changes.rst                              |   8 +
 docs/usage/transfer.rst                       |   2 +-
 scripts/chunker_bench.py                      | 321 ++++++++++++++++++
 src/borg/archiver/benchmark_cmd.py            |  10 +-
 src/borg/archiver/completion_cmd.py           |   2 +-
 src/borg/chunkers/__init__.py                 |   4 +-
 src/borg/chunkers/buzhash64.pyx               |  59 +++-
 src/borg/constants.py                         |   6 +-
 src/borg/helpers/parseformat.py               |  15 +-
 src/borg/testsuite/archiver/list_cmd_test.py  |   2 +-
 src/borg/testsuite/chunkers/buzhash64_test.py |   6 +-
 11 files changed, 414 insertions(+), 21 deletions(-)
 create mode 100644 scripts/chunker_bench.py

diff --git a/docs/changes.rst b/docs/changes.rst
index 566c3d4610..8caa0d0e8b 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -168,6 +168,14 @@ above.
 
 New features:
 
+- buzhash64 chunker: add FastCDC-style normalized chunking and enable it by default
+  (``nc_level=2``). It switches between a stricter and a looser cut mask around the target
+  chunk size, which greatly tightens the chunk-size distribution (chunk-size variance /
+  coefficient of variation roughly cut by ~60% in tests) and removes the dedup-hostile
+  max-size-clamped chunks, at negligible throughput cost and with unchanged deduplication.
+  ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level``
+  (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``).
+  buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x.
 - repo-create: split ``--encryption`` into orthogonal options. ``--encryption`` now
   selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb``
   or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function
diff --git a/docs/usage/transfer.rst b/docs/usage/transfer.rst
index 22e729debe..86c5716bde 100644
--- a/docs/usage/transfer.rst
+++ b/docs/usage/transfer.rst
@@ -55,7 +55,7 @@ locations and passphrases first:
     # The AEAD cipher does not matter (everything must be re-encrypted and
     # re-authenticated anyway); you could also choose -e chacha20-poly1305 -i blake3.
     $ borg repo-create -e aes256-ocb -i blake3
-    $ export CHUNKER_PARAMS="buzhash64,19,23,21,4095"
+    $ export CHUNKER_PARAMS="buzhash64,19,23,21,4095,2"
 
     # 2. Check what and how much it would transfer:
     $ borg transfer --from-borg1 --chunker-params=$CHUNKER_PARAMS --dry-run
diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py
new file mode 100644
index 0000000000..a90b66a04c
--- /dev/null
+++ b/scripts/chunker_bench.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+"""
+buzhash64 chunker evaluation harness.
+
+Purpose
+-------
+Establish an *evidence baseline* for the current buzhash64 chunker (and buzhash32
+for reference) so that any future change to buzhash64 can be judged against real
+numbers instead of intuition.
+
+It measures, for a given chunker config and corpus:
+
+  * chunk-size distribution: count, mean, stddev, coefficient of variation (CV),
+    and how many chunks were clamped at min_size / max_size,
+  * deduplication ratio: unique-chunk-bytes / total-bytes (lower is better dedup),
+  * throughput in MB/s,
+  * shift resilience: re-chunk a mutated copy (bytes inserted/deleted at random
+    offsets) and report what fraction of chunks (by content) survive. This is the
+    property content-defined chunking exists for; size-distribution changes can
+    help or hurt it, so we must watch it.
+
+Corpora
+-------
+  --path FILE_OR_DIR   use real data (a dir is concatenated, file order sorted)
+  --synthetic random:N        N bytes of os.urandom (incompressible, worst case)
+  --synthetic lcg:N           N bytes of a cheap LCG stream (deterministic)
+  --synthetic textish:N       N bytes of low-entropy, repetitive ascii-ish data
+
+Examples
+--------
+  python scripts/chunker_bench.py --synthetic lcg:67108864
+  python scripts/chunker_bench.py --path /usr/lib --max-bytes 268435456
+  python scripts/chunker_bench.py --path ./some.tar --algo buzhash64 buzhash
+
+This script imports the *compiled* borg chunkers, so build borg first.
+It does not modify borg in any way; it is a measurement tool only.
+"""
+
+import argparse
+import hashlib
+import os
+import random
+import statistics
+import sys
+import time
+from io import BytesIO
+
+from borg.chunkers import get_chunker
+from borg.constants import CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE
+
+
+def gen_synthetic(spec):
+    kind, _, rest = spec.partition(":")
+    if kind == "versioned":
+        # parsed below from the full spec (it has two numeric fields)
+        n = 0
+    else:
+        n = int(rest)
+    if kind == "random":
+        return os.urandom(n)
+    if kind == "lcg":
+        a = bytearray(n)
+        x = 1
+        for i in range(n):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return bytes(a)
+    if kind == "versioned":
+        # "versioned:N[:E]" -> corpus = v1 ++ v2, where v2 is v1 with E scattered single-byte
+        # inserts/deletes (default E=64). Models backing up a slightly-changed large file: the
+        # dedup ratio shows how much of v2 is re-deduplicated against v1, which is exactly what
+        # shift-resilient chunk boundaries (and normalized chunking) affect.
+        parts = spec.split(":")
+        n = int(parts[1])
+        edits = int(parts[2]) if len(parts) > 2 else 64
+        v1 = os.urandom(n)
+        v2 = mutate(v1, edits, random.Random(42))
+        corpus = v1 + v2
+        del v1, v2
+        return corpus
+    if kind == "textish":
+        # low-entropy, repetitive: stresses buzhash window cancellation and
+        # tends to produce many min/max-clamped chunks.
+        words = [
+            b"the ",
+            b"quick ",
+            b"brown ",
+            b"fox ",
+            b"jumps ",
+            b"over ",
+            b"lazy ",
+            b"dog ",
+            b"lorem ",
+            b"ipsum ",
+            b"dolor ",
+            b"sit ",
+        ]
+        rng = random.Random(1234)
+        out = bytearray()
+        while len(out) < n:
+            out += rng.choice(words)
+        return bytes(out[:n])
+    raise SystemExit(f"unknown synthetic spec: {spec!r}")
+
+
+def load_path(path, max_bytes):
+    if os.path.isfile(path):
+        with open(path, "rb") as f:
+            return f.read(max_bytes if max_bytes else -1)
+    buf = bytearray()
+    for root, _, files in os.walk(path):
+        for name in sorted(files):
+            fp = os.path.join(root, name)
+            try:
+                with open(fp, "rb") as f:
+                    buf += f.read()
+            except OSError:
+                continue
+            if max_bytes and len(buf) >= max_bytes:
+                return bytes(buf[:max_bytes])
+    return bytes(buf)
+
+
+def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal_size=0):
+    """Chunk data and return (sizes, hashes, chunking_time) without materializing chunk bytes.
+
+    Memory-lean: only a size (int) and a sha256 digest are kept per chunk, so very large
+    corpora can be processed. key=None -> zero key (deterministic)."""
+    params = [min_exp, max_exp, mask_bits, win]
+    kw = dict(key=None, sparse=False)
+    if algo == "buzhash64":
+        params.append(nc_level)  # nc_level is a positional buzhash64 param
+        kw["normal_size"] = normal_size
+    chunker = get_chunker(algo, *params, **kw)
+    sizes = []
+    hashes = []
+    for c in chunker.chunkify(BytesIO(data)):
+        if c.data is None:  # hole / all-zero alloc chunk
+            n = c.meta["size"]
+            sizes.append(n)
+            hashes.append(hashlib.sha256(b"\0" * n).digest())
+        else:
+            b = c.data
+            sizes.append(len(b))
+            hashes.append(hashlib.sha256(b).digest())
+    return sizes, hashes, getattr(chunker, "chunking_time", 0.0)
+
+
+def mutate(data, n_edits, rng):
+    """Insert and delete a few single bytes at random offsets (boundary shift test)."""
+    b = bytearray(data)
+    for _ in range(n_edits):
+        pos = rng.randrange(len(b))
+        if rng.random() < 0.5:
+            b.insert(pos, rng.randrange(256))
+        else:
+            del b[pos]
+    return bytes(b)
+
+
+def analyze(algo, data, params, shift_edits, rng, nc_level=0, normal_size=0):
+    min_exp, max_exp, mask_bits, win = params
+    min_size, max_size = 1 << min_exp, 1 << max_exp
+
+    t0 = time.monotonic()
+    sizes, hashes, internal_t = chunk_stats(algo, data, *params, nc_level=nc_level, normal_size=normal_size)
+    wall = time.monotonic() - t0
+
+    # drop last chunk for distribution stats (it is a remainder, often < min)
+    dist_sizes = sizes[:-1] if len(sizes) > 1 else sizes
+    total = sum(sizes)
+
+    mean = statistics.fmean(dist_sizes) if dist_sizes else 0
+    stdev = statistics.pstdev(dist_sizes) if len(dist_sizes) > 1 else 0.0
+    cv = (stdev / mean) if mean else 0.0
+    min_clamped = sum(1 for s in dist_sizes if s == min_size)
+    max_clamped = sum(1 for s in dist_sizes if s == max_size)
+
+    # dedup ratio: unique chunk content / total (lower = more dedup)
+    seen = set()
+    unique_bytes = 0
+    for h, n in zip(hashes, sizes):
+        if h not in seen:
+            seen.add(h)
+            unique_bytes += n
+    dedup_ratio = unique_bytes / total if total else 0.0
+
+    # shift resilience: re-chunk a mutated copy, fraction of chunks (by content) that survive
+    shift_survival = None
+    if shift_edits:
+        mutated = mutate(data, shift_edits, rng)
+        _, mhashes, _ = chunk_stats(algo, mutated, *params, nc_level=nc_level, normal_size=normal_size)
+        del mutated
+        orig_set = set(hashes)
+        survived = sum(1 for h in mhashes if h in orig_set)
+        shift_survival = survived / len(mhashes) if mhashes else 0.0
+
+    mb = total / (1024 * 1024)
+    secs = internal_t or wall
+    label = algo if not nc_level else f"{algo}/nc{nc_level}"
+    return {
+        "algo": label,
+        "count": len(sizes),
+        "total_mb": mb,
+        "mean": mean,
+        "stdev": stdev,
+        "cv": cv,
+        "min_clamped": min_clamped,
+        "max_clamped": max_clamped,
+        "min_obs": min(dist_sizes) if dist_sizes else 0,
+        "max_obs": max(dist_sizes) if dist_sizes else 0,
+        "dedup_ratio": dedup_ratio,
+        "throughput_mbps": mb / secs if secs else float("inf"),
+        "shift_survival": shift_survival,
+    }
+
+
+def fmt(r):
+    line = (
+        f"{r['algo']:>13}  "
+        f"n={r['count']:>6}  "
+        f"mean={r['mean']/1024:8.1f}K  "
+        f"stdev={r['stdev']/1024:8.1f}K  "
+        f"CV={r['cv']:5.3f}  "
+        f"min/max-clamp={r['min_clamped']:>4}/{r['max_clamped']:<4}  "
+        f"dedup={r['dedup_ratio']:6.4f}  "
+        f"{r['throughput_mbps']:7.1f} MB/s"
+    )
+    if r["shift_survival"] is not None:
+        line += f"  shift-survive={r['shift_survival']:6.4f}"
+    return line
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    src = ap.add_mutually_exclusive_group(required=True)
+    src.add_argument("--path", help="file or directory to use as corpus")
+    src.add_argument("--synthetic", help="random:N | lcg:N | textish:N")
+    ap.add_argument("--max-bytes", type=int, default=0, help="cap corpus size (0 = no cap)")
+    ap.add_argument(
+        "--algo",
+        nargs="+",
+        default=["buzhash64", "buzhash"],
+        help="chunker algos to compare (default: buzhash64 buzhash)",
+    )
+    ap.add_argument("--min-exp", type=int, default=CHUNK_MIN_EXP)
+    ap.add_argument("--max-exp", type=int, default=CHUNK_MAX_EXP)
+    ap.add_argument("--mask-bits", type=int, default=HASH_MASK_BITS)
+    ap.add_argument("--window", type=int, default=HASH_WINDOW_SIZE)
+    ap.add_argument(
+        "--nc-level",
+        type=int,
+        default=2,
+        help="normalized chunking level for buzhash64; runs nc=0 AND this level (0 to disable)",
+    )
+    ap.add_argument(
+        "--normal-size",
+        type=int,
+        default=0,
+        help="explicit NC transition size in bytes (0 = auto = min_size + 2**mask_bits)",
+    )
+    ap.add_argument(
+        "--shift-edits", type=int, default=8, help="number of random insert/delete edits for shift test (0 to skip)"
+    )
+    ap.add_argument("--repeat", type=int, default=1, help="repeat runs (throughput stability)")
+    ap.add_argument("--seed", type=int, default=0)
+    args = ap.parse_args()
+
+    if args.synthetic:
+        data = gen_synthetic(args.synthetic)
+        corpus_desc = args.synthetic
+    else:
+        data = load_path(args.path, args.max_bytes)
+        corpus_desc = args.path
+    if args.max_bytes:
+        data = data[: args.max_bytes]
+
+    params = (args.min_exp, args.max_exp, args.mask_bits, args.window)
+
+    print(f"corpus: {corpus_desc}  size: {len(data)/(1024*1024):.1f} MiB")
+    print(
+        f"params: min_exp={params[0]} max_exp={params[1]} mask_bits={params[2]} "
+        f"window={params[3]}  (target ~{(1<<params[2])/(1024*1024):.2f} MiB)"
+    )
+    print(f"shift test: {args.shift_edits} edits   repeats: {args.repeat}")
+    print("-" * 118)
+
+    # build (algo, nc_level) variants; for buzhash64 also run the requested NC level
+    variants = []
+    for algo in args.algo:
+        variants.append((algo, 0))
+        if algo == "buzhash64" and args.nc_level > 0:
+            variants.append((algo, args.nc_level))
+
+    for algo, nc in variants:
+        best_tput = 0.0
+        last = None
+        for _ in range(args.repeat):
+            r = analyze(
+                algo,
+                data,
+                params,
+                args.shift_edits,
+                random.Random(args.seed),
+                nc_level=nc,
+                normal_size=args.normal_size,
+            )
+            best_tput = max(best_tput, r["throughput_mbps"])
+            last = r
+        last["throughput_mbps"] = best_tput  # report best (least-noisy) throughput
+        print(fmt(last))
+
+    print("-" * 118)
+    print(
+        "notes: dedup<1.0 only if corpus has duplicate content; CV lower = tighter "
+        "size distribution; shift-survive higher = better."
+    )
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py
index e448bbef0a..099cac8b36 100644
--- a/src/borg/archiver/benchmark_cmd.py
+++ b/src/borg/archiver/benchmark_cmd.py
@@ -199,8 +199,8 @@ def chunkit(ch):
             ),
             # note: the buzhash64 chunker creation is rather slow, so we must keep it in setup
             (
-                "buzhash64,19,23,21,4095",
-                "ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False)",
+                "buzhash64,19,23,21,4095,2",
+                "ch = get_chunker('buzhash64', 19, 23, 21, 4095, 2, sparse=False)",
                 "chunkit(ch)",
                 locals(),
             ),
@@ -211,7 +211,7 @@ def chunkit(ch):
                 algo, _, algo_params = spec.partition(",")
                 result["chunkers"].append({"algo": algo, "algo_params": algo_params, "size": size, "time": dt})
             else:
-                print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
+                print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s")
 
         from ..crypto.low_level import hmac_sha256, blake2b_256
         import blake3
@@ -232,7 +232,7 @@ def chunkit(ch):
             if args.json:
                 result["hashes"].append({"algo": spec, "size": size, "time": dt})
             else:
-                print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
+                print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s")
 
         from ..crypto.low_level import AES256_CTR_BLAKE2b, AES256_CTR_HMAC_SHA256
         from ..crypto.low_level import AES256_OCB, CHACHA20_POLY1305
@@ -272,7 +272,7 @@ def chunkit(ch):
             if args.json:
                 result["encryption"].append({"algo": spec, "size": size, "time": dt})
             else:
-                print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s")
+                print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s")
 
         if not args.json:
             print("Compression ====================================================")
diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py
index 4b973fafce..1bce0cb15d 100644
--- a/src/borg/archiver/completion_cmd.py
+++ b/src/borg/archiver/completion_cmd.py
@@ -708,7 +708,7 @@ def do_completion(self, args):
         comp_spec_choices_str = " ".join(comp_spec_choices)
 
         # Chunker params choices (static list)
-        chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095"]
+        chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"]
         chunker_params_choices_str = " ".join(chunker_params_choices)
 
         # Relative time marker choices (static list)
diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py
index dd3376985d..7282b5e8eb 100644
--- a/src/borg/chunkers/__init__.py
+++ b/src/borg/chunkers/__init__.py
@@ -17,7 +17,9 @@ def get_chunker(algo, *params, **kw):
     if algo == "buzhash":
         return Chunker(seed, *params, sparse=sparse)
     if algo == "buzhash64":
-        return ChunkerBuzHash64(bh64_key, *params, sparse=sparse)
+        # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level);
+        # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto).
+        return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse)
     if algo == "fixed":
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":
diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx
index a3c7bf1101..1a1ce8fc9c 100644
--- a/src/borg/chunkers/buzhash64.pyx
+++ b/src/borg/chunkers/buzhash64.pyx
@@ -109,6 +109,9 @@ cdef class ChunkerBuzHash64:
     It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks.
     """
     cdef uint64_t chunk_mask
+    cdef uint64_t mask_s, mask_l  # normalized chunking: strict / loose masks
+    cdef size_t normal_size       # chunk length at which we switch mask_s -> mask_l
+    cdef int nc_level             # normalized chunking level (0 = disabled)
     cdef uint64_t* table
     cdef uint8_t* data
     cdef object _fd  # Python object for file descriptor
@@ -121,7 +124,7 @@ cdef class ChunkerBuzHash64:
     cdef size_t reader_block_size
     cdef bint sparse
 
-    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False):
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, int nc_level=0, size_t normal_size=0, bint sparse=False):
         self.table = NULL
         self.data = NULL
         min_size = 1 << chunk_min_exp
@@ -131,8 +134,29 @@ cdef class ChunkerBuzHash64:
         assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
 
         self.window_size = hash_window_size
-        self.chunk_mask = (1 << hash_mask_bits) - 1
+        self.chunk_mask = (1ULL << hash_mask_bits) - 1
         self.min_size = min_size
+        # Normalized chunking (FastCDC-style): use a stricter mask (lower cut probability) until
+        # the chunk reaches its expected/normal size, then a looser mask (higher cut probability).
+        # This concentrates chunk sizes around the target and reduces chunk-size variance.
+        # nc_level == 0 disables it, keeping behavior byte-identical to the single-mask chunker.
+        assert nc_level >= 0
+        assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits"
+        assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits"
+        self.nc_level = nc_level
+        if nc_level:
+            self.mask_s = (1ULL << (hash_mask_bits + nc_level)) - 1
+            self.mask_l = (1ULL << (hash_mask_bits - nc_level)) - 1
+            # normal_size is the chunk length at which we switch from the strict to the loose
+            # mask; it dominates the mean chunk size. The default is the nominal target size
+            # (1ULL << hash_mask_bits) minus the expected loose-phase tail (1ULL << (bits - nc_level)),
+            # which lands the mean close to the target instead of overshooting it. Pass an
+            # explicit normal_size to tune it further.
+            self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level)))
+        else:
+            self.mask_s = self.chunk_mask
+            self.mask_l = self.chunk_mask
+            self.normal_size = 0
         self.table = buzhash64_init_table(key)
         self.buf_size = max_size
         self.data = <uint8_t*>malloc(self.buf_size)
@@ -196,10 +220,14 @@ cdef class ChunkerBuzHash64:
 
     cdef object process(self) except *:
         """Process the chunker's buffer and return the next chunk."""
-        cdef uint64_t sum, chunk_mask = self.chunk_mask
+        cdef uint64_t sum, mask
+        cdef uint64_t mask_s = self.mask_s, mask_l = self.mask_l
+        cdef int nc_level = self.nc_level
         cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size
+        cdef size_t normal_size = self.normal_size, normal_pos
         cdef uint8_t* p
         cdef uint8_t* stop_at
+        cdef uint8_t* nc_stop
         cdef size_t did_bytes
 
         if self.done:
@@ -232,11 +260,32 @@ cdef class ChunkerBuzHash64:
         self.remaining -= min_size
         sum = _buzhash64(self.data + self.position, window_size, self.table)
 
-        while self.remaining > window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size):
+        # Normalized chunking: pick the mask based on how far we are into the current chunk.
+        # While below normal_size use the strict mask (lower cut probability), afterward the
+        # loose mask (higher cut probability). The mask is re-evaluated at the top of every
+        # iteration, so the transition is honored exactly at normal_pos. When nc is disabled,
+        # mask_s == mask_l == chunk_mask and the normal_pos cap is not applied, so this reduces
+        # to the original single-mask behavior.
+        mask = mask_s
+        normal_pos = 0
+        while True:
+            if nc_level:
+                normal_pos = self.last + normal_size
+                mask = mask_s if self.position < normal_pos else mask_l
+
+            if not (self.remaining > window_size and (sum & mask) and not (self.eof and self.remaining <= window_size)):
+                break
+
             p = self.data + self.position
             stop_at = p + self.remaining - window_size
 
-            while p < stop_at and (sum & chunk_mask):
+            if nc_level and self.position < normal_pos:
+                # do not scan past the strict->loose transition; re-evaluate the mask there
+                nc_stop = self.data + normal_pos
+                if nc_stop < stop_at:
+                    stop_at = nc_stop
+
+            while p < stop_at and (sum & mask):
                 sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table)
                 p += 1
 
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 5c88b6b89e..319393ec30 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -111,9 +111,13 @@
 HASH_WINDOW_SIZE = 0xFFF  # 4095 B
 HASH_MASK_BITS = 21  # results in ~2 MiB chunks statistically
 
+# buzhash64-only: normalized chunking level (0 disables it). buzhash (32bit) does not support this
+# and must stay bit-compatible to borg 1.x, so it has no nc_level param.
+NC_LEVEL = 2  # FastCDC-style normalized chunking: tightens chunk-size distribution (much lower variance)
+
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
-CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
+CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL)
 
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index b8d9f89a9a..72183885fd 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -304,8 +304,11 @@ def ChunkerParams(s):
         return algo, block_size, header_size
     if algo == "default" and count == 1:  # default
         return CHUNKER_PARAMS
-    if algo == CH_BUZHASH64 and count == 5:  # buzhash64, chunk_min, chunk_max, chunk_mask, window_size
-        chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:])
+    if algo == CH_BUZHASH64 and count == 6:
+        # buzhash64, chunk_min, chunk_max, chunk_mask, window_size, nc_level
+        # use nc_level 0 to disable normalized chunking.
+        chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:5])
+        nc_level = int(params[5])
         if not (chunk_min <= chunk_mask <= chunk_max):
             raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
         if chunk_min < 6:
@@ -313,8 +316,14 @@ def ChunkerParams(s):
             raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)")
         if chunk_max > 23:
             raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)")
+        # normalized chunking switches the mask at the target size; it needs room below and above
+        # the base mask bits (chunk_mask). nc_level 0 disables it.
+        if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48):
+            raise ArgumentTypeError(
+                "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48"
+            )
         # note that for buzhash64, there is no problem with even window_size.
-        return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size
+        return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
     if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
diff --git a/src/borg/testsuite/archiver/list_cmd_test.py b/src/borg/testsuite/archiver/list_cmd_test.py
index f9bb56e58a..d5ce605890 100644
--- a/src/borg/testsuite/archiver/list_cmd_test.py
+++ b/src/borg/testsuite/archiver/list_cmd_test.py
@@ -250,7 +250,7 @@ def test_fingerprint(archivers, request):
     assert fingerprints1["input/file2"] != fingerprints4["input/file2"]
 
     # Also try with buzhash64
-    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input")
+    cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095,2", "test5", "input")
     output = cmd(archiver, "list", "test5", "--format={fingerprint} {path}{NL}")
     fingerprints5 = {}
     for line in output.splitlines():
diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py
index 0bbeb4d3d5..9b19448587 100644
--- a/src/borg/testsuite/chunkers/buzhash64_test.py
+++ b/src/borg/testsuite/chunkers/buzhash64_test.py
@@ -110,15 +110,15 @@ def test_fuzz_bh64(worker):
     def rnd_key():
         return os.urandom(32)
 
-    # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size)
-    algo, min_exp, max_exp, mask_bits, win_size = CHUNKER64_PARAMS
+    # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size, nc_level)
+    algo, min_exp, max_exp, mask_bits, win_size, nc_level = CHUNKER64_PARAMS
     assert algo == CH_BUZHASH64  # default chunker must be buzhash64 here
 
     keys = [b"\0" * 32] + [rnd_key() for _ in range(10)]
     sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
 
     for key in keys:
-        chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size)
+        chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size, nc_level)
         for size in sizes:
             # Random data
             data = os.urandom(size)

From afa8189938e8ad58780900322f4a12aacb27aa5d Mon Sep 17 00:00:00 2001
From: Thomas Waldmann <tw@waldmann-edv.de>
Date: Sat, 27 Jun 2026 22:47:52 +0200
Subject: [PATCH 2/2] fastcdc: add FastCDC chunker with a keyed Gear hash

Add a new "fastcdc" content-defined chunker selectable via --chunker-params.
It uses the FastCDC Gear rolling hash (fp = (fp << 1) + Gear[byte]), which is
window-less and cheaper per byte than buzhash's cyclic-polynomial update, so it
chunks noticeably faster (see "borg benchmark cpu" output), while producing
the same chunk-size distribution and deduplication.

The Gear table is keyed: it is derived from the repo id key via CSPRNG (own
"fastcdc" domain), exactly like the buzhash64 table, so chunk cut points stay
unpredictable without the key (anti-fingerprinting). It implements the same
FastCDC techniques as buzhash64 (sub-minimum skipping, normalized chunking with
a required nc_level, min/max clamping); the mask uses the high bits of the hash
(Gear accumulates entropy there).

chunker-params: "fastcdc,chunk_min,chunk_max,chunk_mask,nc_level" - there is no
window field, because Gear is window-less. e.g. fastcdc,19,23,21,2

Also: borg benchmark cpu now measures the fastcdc chunker; tests in
borg.testsuite.chunkers (golden vector, size distribution, keyed gear table,
param parsing, slow fuzz); docs and changelog.

Benchmarks (scripts/chunker_bench.py, buzhash64 vs fastcdc, both nc_level=2,
incompressible data unless noted):

  5 GiB, 2 MiB target (default params):
    buzhash64: CV 0.294, 1011 MB/s
    fastcdc:   CV 0.295, 1313 MB/s   (+30%)

  64 MiB, 64 KiB target:
    buzhash64: CV 0.374, shift-resilience 0.9928,  963 MB/s
    fastcdc:   CV 0.359, shift-resilience 0.9929, 1331 MB/s   (+38%)

  Re-backup of a 2.5 GiB file after scattered single-byte edits (dedup ratio,
  0.5 = v2 fully deduplicated, lower is better):
     64 edits:  buzhash64 0.5237, fastcdc 0.5236
    320 edits:  buzhash64 0.6133, fastcdc 0.6161

  borg benchmark cpu, 1 GB: fastcdc 3.80s, buzhash 4.36s, buzhash64 8.13s,
  fixed 0.56s.

Chunk-size distribution, deduplication and shift-resilience match buzhash64
within noise; fastcdc is consistently faster.

Also: fix bug when computing the mask, one needs to use 1ULL instead of
1, so the shifting computation is done in a uint64, not in a 32bit int.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitignore                                  |   1 +
 docs/changes.rst                            |   6 +
 docs/global.rst.inc                         |   1 +
 docs/internals/data-structures.rst          |  18 ++
 scripts/chunker_bench.py                    |   9 +-
 setup.py                                    |   3 +
 src/borg/archiver/benchmark_cmd.py          |   2 +
 src/borg/archiver/completion_cmd.py         |   8 +-
 src/borg/chunkers/__init__.py               |  16 +-
 src/borg/chunkers/fastcdc.pyx               | 285 ++++++++++++++++++++
 src/borg/constants.py                       |   3 +
 src/borg/helpers/parseformat.py             |  18 ++
 src/borg/testsuite/chunkers/fastcdc_test.py | 161 +++++++++++
 13 files changed, 523 insertions(+), 8 deletions(-)
 create mode 100644 src/borg/chunkers/fastcdc.pyx
 create mode 100644 src/borg/testsuite/chunkers/fastcdc_test.py

diff --git a/.gitignore b/.gitignore
index f2f7461228..a2254bafd7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ src/borg/crypto/low_level.c
 src/borg/item.c
 src/borg/chunkers/buzhash.c
 src/borg/chunkers/buzhash64.c
+src/borg/chunkers/fastcdc.c
 src/borg/chunkers/reader.c
 src/borg/checksums.c
 src/borg/platform/darwin.c
diff --git a/docs/changes.rst b/docs/changes.rst
index 8caa0d0e8b..9aac15a9f5 100644
--- a/docs/changes.rst
+++ b/docs/changes.rst
@@ -176,6 +176,12 @@ New features:
   ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level``
   (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``).
   buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x.
+- new ``fastcdc`` chunker: a FastCDC content-defined chunker using a window-less, keyed Gear
+  rolling hash (the gear table is derived from the repo's id key, like buzhash64, so cut points
+  stay unpredictable without the key). It supports the same normalized chunking as buzhash64 and
+  produces the same chunk-size distribution and deduplication, but chunks roughly 1.3-1.5x faster.
+  Select it via ``--chunker-params fastcdc,chunk_min,chunk_max,chunk_mask,nc_level`` (no window
+  field; e.g. ``fastcdc,19,23,21,2``). ``borg benchmark cpu`` now reports its throughput too.
 - repo-create: split ``--encryption`` into orthogonal options. ``--encryption`` now
   selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb``
   or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function
diff --git a/docs/global.rst.inc b/docs/global.rst.inc
index a3c8df1cc8..196391a265 100644
--- a/docs/global.rst.inc
+++ b/docs/global.rst.inc
@@ -19,6 +19,7 @@
 .. _OpenSSL: https://www.openssl.org/
 .. _`Python 3`: https://www.python.org/
 .. _Buzhash: https://en.wikipedia.org/wiki/Buzhash
+.. _FastCDC: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia
 .. _msgpack: https://msgpack.org/
 .. _`msgpack-python`: https://pypi.org/project/msgpack-python/
 .. _llfuse: https://pypi.org/project/llfuse/
diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
index 0a1e86edb0..2f63218758 100644
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@@ -403,6 +403,8 @@ Borg has these chunkers:
 - "buzhash": variable, content-defined blocksize, uses a rolling hash
   computed by the Buzhash_ algorithm.
 - "buzhash64": similar to "buzhash", but improved 64bit implementation
+- "fastcdc": variable, content-defined blocksize, uses the window-less, keyed
+  Gear rolling hash (FastCDC_); faster than buzhash, same deduplication.
 
 For some more general usage hints see also ``--chunker-params``.
 
@@ -483,6 +485,22 @@ The buzhash table is cryptographically derived from secret key material.
 These changes should improve resistance against attacks and also solve
 some of the issues of the original (32bit / XORed table) implementation.
 
+"fastcdc" chunker
++++++++++++++++++
+
+FastCDC_ content-defined chunker using the Gear rolling hash. Unlike buzhash it
+is window-less (each byte's influence simply decays out of the hash), so its
+update is cheaper and it chunks noticeably faster, while producing the same
+deduplication and (with normalized chunking) the same chunk-size distribution.
+
+Like "buzhash64", the Gear table is cryptographically derived from secret key
+material, so chunk cut points are unpredictable without the key.
+
+``borg create --chunker-params fastcdc,CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,NC_LEVEL``
+
+There is no window size (Gear is window-less). NC_LEVEL is the normalized
+chunking level (0 disables it); 2 is a good default. E.g.: ``fastcdc,19,23,21,2``.
+
 .. _cache:
 
 The cache
diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py
index a90b66a04c..1d439d0099 100644
--- a/scripts/chunker_bench.py
+++ b/scripts/chunker_bench.py
@@ -129,7 +129,10 @@ def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal
     params = [min_exp, max_exp, mask_bits, win]
     kw = dict(key=None, sparse=False)
     if algo == "buzhash64":
-        params.append(nc_level)  # nc_level is a positional buzhash64 param
+        params.append(nc_level)  # nc_level is a positional param
+        kw["normal_size"] = normal_size
+    elif algo == "fastcdc":
+        params = [min_exp, max_exp, mask_bits, nc_level]  # fastcdc is window-less
         kw["normal_size"] = normal_size
     chunker = get_chunker(algo, *params, **kw)
     sizes = []
@@ -285,11 +288,11 @@ def main():
     print(f"shift test: {args.shift_edits} edits   repeats: {args.repeat}")
     print("-" * 118)
 
-    # build (algo, nc_level) variants; for buzhash64 also run the requested NC level
+    # build (algo, nc_level) variants; for buzhash64/fastcdc also run the requested NC level
     variants = []
     for algo in args.algo:
         variants.append((algo, 0))
-        if algo == "buzhash64" and args.nc_level > 0:
+        if algo in ("buzhash64", "fastcdc") and args.nc_level > 0:
             variants.append((algo, args.nc_level))
 
     for algo, nc in variants:
diff --git a/setup.py b/setup.py
index 19f1c92789..1ad76e8048 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@
 crypto_legacy_ll_source = "src/borg/legacy/crypto/low_level.pyx"
 buzhash_source = "src/borg/chunkers/buzhash.pyx"
 buzhash64_source = "src/borg/chunkers/buzhash64.pyx"
+fastcdc_source = "src/borg/chunkers/fastcdc.pyx"
 reader_source = "src/borg/chunkers/reader.pyx"
 hashindex_source = "src/borg/hashindex.pyx"
 item_source = "src/borg/item.pyx"
@@ -73,6 +74,7 @@
     crypto_legacy_ll_source,
     buzhash_source,
     buzhash64_source,
+    fastcdc_source,
     reader_source,
     hashindex_source,
     item_source,
@@ -189,6 +191,7 @@ def lib_ext_kwargs(pc, prefix_env_var, lib_name, lib_pkg_name, pc_version, lib_s
         Extension("borg.item", [item_source], extra_compile_args=cflags),
         Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags),
         Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags),
+        Extension("borg.chunkers.fastcdc", [fastcdc_source], extra_compile_args=cflags),
         Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags),
     ]
 
diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py
index 099cac8b36..71615e6456 100644
--- a/src/borg/archiver/benchmark_cmd.py
+++ b/src/borg/archiver/benchmark_cmd.py
@@ -204,6 +204,8 @@ def chunkit(ch):
                 "chunkit(ch)",
                 locals(),
             ),
+            # fastcdc (window-less keyed gear hash); gear table creation is slow, keep it in setup
+            ("fastcdc,19,23,21,2", "ch = get_chunker('fastcdc', 19, 23, 21, 2, sparse=False)", "chunkit(ch)", locals()),
             ("fixed,1048576", "ch = get_chunker('fixed', 1048576, sparse=False)", "chunkit(ch)", locals()),
         ]:
             dt = timeit(func, setup, number=number_default, globals=vars)
diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py
index 1bce0cb15d..6b73444913 100644
--- a/src/borg/archiver/completion_cmd.py
+++ b/src/borg/archiver/completion_cmd.py
@@ -708,7 +708,13 @@ def do_completion(self, args):
         comp_spec_choices_str = " ".join(comp_spec_choices)
 
         # Chunker params choices (static list)
-        chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"]
+        chunker_params_choices = [
+            "default",
+            "fixed,4194304",
+            "buzhash,19,23,21,4095",
+            "buzhash64,19,23,21,4095,2",
+            "fastcdc,19,23,21,2",
+        ]
         chunker_params_choices_str = " ".join(chunker_params_choices)
 
         # Relative time marker choices (static list)
diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py
index 7282b5e8eb..461cc90d16 100644
--- a/src/borg/chunkers/__init__.py
+++ b/src/borg/chunkers/__init__.py
@@ -1,5 +1,6 @@
 from .buzhash import Chunker
 from .buzhash64 import ChunkerBuzHash64
+from .fastcdc import ChunkerFastCDC
 from .failing import ChunkerFailing
 from .fixed import ChunkerFixed
 from .reader import *  # noqa
@@ -10,16 +11,23 @@ def get_chunker(algo, *params, **kw):
     sparse = kw.get("sparse", False)
     # key.chunk_seed only has 32 bits
     seed = key.chunk_seed if key is not None else 0
-    # for buzhash64, we want a much longer key, so we derive it from the id key
-    bh64_key = (
-        key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32
-    )
     if algo == "buzhash":
         return Chunker(seed, *params, sparse=sparse)
     if algo == "buzhash64":
+        # for buzhash64, we want a much longer key, so we derive it from the id key.
         # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level);
         # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto).
+        bh64_key = (
+            key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32
+        )
         return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse)
+    if algo == "fastcdc":
+        # keyed gear table, derived from the id key (own domain). params is
+        # (chunk_min_exp, chunk_max_exp, hash_mask_bits, nc_level) - no window (Gear is window-less).
+        fc_key = (
+            key.derive_key(salt=b"", domain=b"fastcdc", size=32, from_id_key=True) if key is not None else b"\0" * 32
+        )
+        return ChunkerFastCDC(fc_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse)
     if algo == "fixed":
         return ChunkerFixed(*params, sparse=sparse)
     if algo == "fail":
diff --git a/src/borg/chunkers/fastcdc.pyx b/src/borg/chunkers/fastcdc.pyx
new file mode 100644
index 0000000000..2248ff5406
--- /dev/null
+++ b/src/borg/chunkers/fastcdc.pyx
@@ -0,0 +1,285 @@
+# cython: language_level=3
+
+import cython
+import time
+
+from cpython.bytes cimport PyBytes_AsString
+from libc.stdint cimport uint8_t, uint64_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy, memmove, memset
+
+from ..crypto.low_level import CSPRNG
+
+from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros
+from .reader import FileReader, Chunk
+
+# FastCDC content-defined chunker (Xia et al., USENIX ATC 2016).
+#
+# Differences vs. the buzhash64 chunker in this package:
+#  * It uses the Gear rolling hash: fp = (fp << 1) + Gear[byte]. This is a single shift,
+#    add and table lookup per byte (no window, no "remove" term), so it is cheaper than
+#    buzhash's cyclic-polynomial update.
+#  * The Gear table is keyed from a 256-bit key via the same CSPRNG used by buzhash64, so
+#    cut points are unpredictable without the key (anti-fingerprinting), just like buzhash64.
+#  * Because the Gear hash accumulates information in its HIGH bits (the low bits only depend
+#    on the most recent bytes), the cut-decision mask uses the high bits of the hash.
+#
+# It implements the same FastCDC techniques the buzhash64 chunker uses: sub-minimum cut-point
+# skipping, normalized chunking (strict/loose mask around a "normal" size), and min/max clamping.
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef uint64_t* fastcdc_init_gear(bytes key) except NULL:
+    """Generate a keyed 256-entry, 64-bit Gear table deterministically from a 256-bit key."""
+    rng = CSPRNG(key)
+    cdef bytes rnd = rng.random_bytes(2048)  # 256 * sizeof(uint64_t)
+    cdef const uint8_t* rp = <const uint8_t*>PyBytes_AsString(rnd)
+    cdef uint64_t* gear = <uint64_t*>malloc(2048)
+    if gear == NULL:
+        raise MemoryError("Failed to allocate fastcdc gear table")
+    cdef int i, j
+    cdef uint64_t v
+    for i in range(256):
+        v = 0
+        for j in range(8):
+            v |= (<uint64_t>rp[i * 8 + j]) << (8 * j)
+        gear[i] = v
+    return gear
+
+
+cdef inline uint64_t _high_mask(int bits):
+    """A mask with <bits> one-bits in the most significant positions (Gear's strong bits)."""
+    if bits <= 0:
+        return 0
+    if bits >= 64:
+        return <uint64_t>0xFFFFFFFFFFFFFFFF
+    return ((<uint64_t>1 << bits) - 1) << (64 - bits)
+
+
+cdef class ChunkerFastCDC:
+    """
+    FastCDC content-defined chunker, variable chunk sizes, keyed Gear hash.
+
+    Unlike the buzhash chunkers, Gear is window-less, so there is no hash_window_size parameter.
+    """
+    cdef uint64_t chunk_mask
+    cdef uint64_t mask_s, mask_l  # normalized chunking: strict / loose masks
+    cdef size_t normal_size       # chunk length at which we switch mask_s -> mask_l
+    cdef int nc_level             # normalized chunking level (0 = disabled)
+    cdef uint64_t* gear
+    cdef uint8_t* data
+    cdef object _fd
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded
+    cdef readonly float chunking_time
+    cdef object file_reader
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int nc_level=0, size_t normal_size=0, bint sparse=False):
+        self.gear = NULL
+        self.data = NULL
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        assert min_size + 1 <= max_size, "too small max_size"
+
+        self.chunk_mask = _high_mask(hash_mask_bits)
+        self.min_size = min_size
+        # Normalized chunking, identical structure to the buzhash64 chunker (see there), but with
+        # the mask one-bits placed in the high bits of the Gear hash.
+        assert nc_level >= 0
+        assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits"
+        assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits"
+        self.nc_level = nc_level
+        if nc_level:
+            self.mask_s = _high_mask(hash_mask_bits + nc_level)
+            self.mask_l = _high_mask(hash_mask_bits - nc_level)
+            self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level)))
+        else:
+            self.mask_s = self.chunk_mask
+            self.mask_l = self.chunk_mask
+            self.normal_size = 0
+        self.gear = fastcdc_init_gear(key)
+        self.buf_size = max_size
+        self.data = <uint8_t*>malloc(self.buf_size)
+        if self.data == NULL:
+            raise MemoryError("Failed to allocate chunker buffer")
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        if self.gear != NULL:
+            free(self.gear)
+            self.gear = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """Fill the chunker's buffer with more data."""
+        cdef ssize_t n
+        cdef object chunk
+
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining
+
+        if self.eof or n == 0:
+            return 1
+
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            if chunk.meta["allocation"] == CH_DATA:
+                memcpy(self.data + self.position + self.remaining, <const unsigned char*>PyBytes_AsString(chunk.data), n)
+            else:
+                memset(self.data + self.position + self.remaining, 0, n)
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+        return 1
+
+    cdef object process(self) except *:
+        """Process the chunker's buffer and return the next chunk."""
+        cdef uint64_t fp = 0, mask, mask_s = self.mask_s, mask_l = self.mask_l
+        cdef int nc_level = self.nc_level
+        cdef size_t n, old_last, min_size = self.min_size
+        cdef size_t normal_size = self.normal_size, normal_pos, chunk_len, did
+        cdef uint8_t* p
+        cdef uint8_t* stop
+        cdef uint8_t* cut
+        cdef uint64_t* gear = self.gear
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        # ensure at least min_size + 1 bytes are buffered, or we are at eof
+        while self.remaining < min_size + 1 and not self.eof:
+            if not self.fill():
+                return None
+
+        # at eof with only a remainder (< min_size + 1): emit it as the final chunk
+        if self.eof and self.remaining < min_size + 1:
+            self.done = 1
+            if self.remaining:
+                old_last = self.last
+                self.position += self.remaining
+                self.last = self.position
+                n = self.last - old_last
+                self.remaining = 0
+                self.bytes_yielded += n
+                return memoryview((self.data + old_last)[:n])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # skip the sub-minimum region (no cut allowed below min_size), then gear-scan
+        self.position += min_size
+        self.remaining -= min_size
+        fp = 0
+
+        while True:
+            chunk_len = self.position - self.last
+            mask = mask_s if (nc_level and chunk_len < normal_size) else mask_l
+
+            if self.remaining == 0:
+                if self.eof:
+                    break  # cut at end of data
+                if not self.fill():
+                    return None
+                if self.remaining == 0:
+                    break  # buffer full -> chunk reached max_size -> forced cut
+                continue
+
+            p = self.data + self.position
+            stop = p + self.remaining
+            if nc_level and chunk_len < normal_size:
+                # do not scan past the strict->loose transition; re-evaluate the mask there
+                normal_pos = self.last + normal_size
+                if (self.data + normal_pos) < stop:
+                    stop = self.data + normal_pos
+
+            cut = NULL
+            while p < stop:
+                fp = (fp << 1) + gear[p[0]]
+                if (fp & mask) == 0:
+                    cut = p
+                    break
+                p += 1
+
+            if cut != NULL:
+                p = cut + 1  # cut right after the byte that triggered the boundary
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+                break
+            else:
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def fastcdc_get_gear_table(bytes key):
+    """Get the keyed gear table generated from <key> (for tests / inspection)."""
+    cdef uint64_t* gear = fastcdc_init_gear(key)
+    cdef int i
+    try:
+        return [gear[i] for i in range(256)]
+    finally:
+        free(gear)
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 319393ec30..17187631d2 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -102,6 +102,7 @@
 # chunker algorithms
 CH_BUZHASH = "buzhash"
 CH_BUZHASH64 = "buzhash64"
+CH_FASTCDC = "fastcdc"
 CH_FIXED = "fixed"
 CH_FAIL = "fail"
 
@@ -118,6 +119,8 @@
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL)
+# fastcdc uses a window-less Gear hash, so it has no window_size parameter.
+FASTCDC_PARAMS = (CH_FASTCDC, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, NC_LEVEL)
 
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 72183885fd..b5b2928d93 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -324,6 +324,24 @@ def ChunkerParams(s):
             )
         # note that for buzhash64, there is no problem with even window_size.
         return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level
+    if algo == CH_FASTCDC and count == 5:
+        # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level
+        # fastcdc uses a window-less Gear hash, so there is no window_size field.
+        # nc_level is required; use nc_level 0 to disable normalized chunking.
+        chunk_min, chunk_max, chunk_mask = (int(p) for p in params[1:4])
+        nc_level = int(params[4])
+        if not (chunk_min <= chunk_mask <= chunk_max):
+            raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
+        if chunk_min < 6:
+            # see comment in 'fixed' algo check
+            raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)")
+        if chunk_max > 23:
+            raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)")
+        if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48):
+            raise ArgumentTypeError(
+                "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48"
+            )
+        return CH_FASTCDC, chunk_min, chunk_max, chunk_mask, nc_level
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
     if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
diff --git a/src/borg/testsuite/chunkers/fastcdc_test.py b/src/borg/testsuite/chunkers/fastcdc_test.py
new file mode 100644
index 0000000000..cf06193a6e
--- /dev/null
+++ b/src/borg/testsuite/chunkers/fastcdc_test.py
@@ -0,0 +1,161 @@
+from hashlib import sha256
+from io import BytesIO
+import os
+import random
+
+import pytest
+
+from . import cf, cf_expand
+from ...chunkers import ChunkerFastCDC, get_chunker
+from ...chunkers.fastcdc import fastcdc_get_gear_table
+from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
+
+
+# from os.urandom(32)
+key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
+key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
+
+
+def H(data):
+    return sha256(data).digest()
+
+
+def test_chunkpoints_fastcdc_unchanged():
+    def twist(size):
+        x = 1
+        a = bytearray(size)
+        for i in range(size):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return a
+
+    data = twist(100000)
+
+    runs = []
+    for nc_level in (0, 2, 3):
+        for minexp in (4, 6, 7, 11, 12):
+            for maxexp in (15, 17):
+                if minexp >= maxexp:
+                    continue
+                for maskbits in (4, 7, 10, 12):
+                    if maskbits - nc_level < 1:  # nc_level needs room below the base mask bits
+                        continue
+                    for key in (key0, key1):
+                        fh = BytesIO(data)
+                        chunker = ChunkerFastCDC(key, minexp, maxexp, maskbits, nc_level)
+                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
+                        runs.append(H(b"".join(chunks)))
+
+    # The "correct" hash below matches the existing chunker behavior.
+    # Future chunker optimizations must not change this, or existing repos will bloat.
+    overall_hash = H(b"".join(runs))
+    print(overall_hash.hex())
+    assert overall_hash == hex_to_bin("50d39b6f30214d78f665ff97a4800142cddcb6a7c5995e5d162f9c6dceb20cfe")
+
+
+def test_fastcdc_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask, nc_level = 10, 16, 14, 2  # chunk size target 16 KiB, clip at 1 KiB and 64 KiB
+    chunker = ChunkerFastCDC(key0, min_exp, max_exp, mask, nc_level)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to the gear hash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
+
+
+def test_fastcdc_gear_table():
+    # Test that the function returns a list of 256 integers
+    table0 = fastcdc_get_gear_table(key0)
+    assert len(table0) == 256
+    for value in table0:
+        assert isinstance(value, int)
+        assert 0 <= value < 2**64
+
+    # deterministic (same key produces same table)
+    assert table0 == fastcdc_get_gear_table(key0)
+
+    # different keys produce different tables
+    table1 = fastcdc_get_gear_table(key1)
+    assert table0 != table1
+
+
+def test_fastcdc_get_chunker():
+    # without a key, get_chunker uses an all-zero key; chunking must still work and be deterministic
+    data = os.urandom(2 * 1024 * 1024)
+    a = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    b = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    assert a == b
+    assert b"".join(a) == data
+
+
+def test_fastcdc_params_parsing():
+    from argparse import ArgumentTypeError
+
+    from ...helpers import ChunkerParams
+
+    # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level (no window field)
+    assert ChunkerParams("fastcdc,19,23,21,2") == (CH_FASTCDC, 19, 23, 21, 2)
+    assert ChunkerParams("fastcdc,10,23,16,0") == (CH_FASTCDC, 10, 23, 16, 0)
+    # a 6-field (buzhash64-style, with window) fastcdc must be rejected
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,4095,2")
+    # nc_level out of range (chunk_mask - nc_level < 1)
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,21")
+    # chunk_min <= chunk_mask <= chunk_max violated
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,24,2")
+
+
+@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
+@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
+def test_fuzz_fastcdc(worker):
+    # Fuzz fastcdc with random and uniform data of misc. sizes and misc keys.
+    def rnd_key():
+        return os.urandom(32)
+
+    # decompose FASTCDC_PARAMS = (algo, min_exp, max_exp, mask_bits, nc_level)
+    algo, min_exp, max_exp, mask_bits, nc_level = FASTCDC_PARAMS
+    assert algo == CH_FASTCDC
+
+    keys = [b"\0" * 32] + [rnd_key() for _ in range(10)]
+    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
+
+    for key in keys:
+        chunker = ChunkerFastCDC(key, min_exp, max_exp, mask_bits, nc_level)
+        for size in sizes:
+            # Random data
+            data = os.urandom(size)
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-same data (non-zero)
+            data = b"\x42" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-zero data
+            data = b"\x00" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data