From 0e3876d5e423988ad4752a4d4bfd5cb486bdc3b2 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 27 Jun 2026 20:21:05 +0200 Subject: [PATCH 1/2] buzhash64: add FastCDC-style normalized chunking Normalized chunking switches between a stricter and a looser cut mask around the target chunk size. This greatly tightens the chunk-size distribution (coefficient of variation ~0.9 -> ~0.3 in tests) and removes the dedup-hostile max-size-clamped chunks, with unchanged deduplication. chunker-params for buzhash64 gains a required 6th field, nc_level: buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level Use nc_level=2 for the new default, nc_level=0 to disable (then behavior is byte-identical to the previous single-mask chunker). buzhash (32bit) is untouched and stays bit-compatible with borg 1.x. The mask transition point (normal_size) defaults to a principled formula (target minus the expected loose-phase tail) so the mean stays near the target; it can be tuned via the normal_size constructor arg. scripts/chunker_bench.py: evidence harness used to measure chunk-size distribution, dedup ratio, throughput and shift-resilience. Measurements (before = nc_level 0, after = nc_level 2; both at the default params buzhash64,19,23,21,4095; measured with scripts/chunker_bench.py): 5 GiB of incompressible data (~2000-2700 chunks, statistically stable): before: CV 0.739, 49 max-size-clamped (8 MiB) chunks, 953 MB/s after: CV 0.311, 0 max-size-clamped chunks, 1024 MB/s Re-backup of a 2.5 GiB file after a few scattered single-byte edits (deduplication ratio; 0.5 = v2 fully deduplicated against v1, lower is better): 64 edits: before 0.5424 -> after 0.5235 320 edits: before 0.6791 -> after 0.6142 Normalized chunking deduplicates better after edits: removing the max-size-clamped chunks means a single-byte change invalidates much less data (about 36% less dedup overhead at 320 edits). Throughput was also consistently higher with nc_level=2 at this scale. 
Also: fix bug when computing the mask, one needs to use 1ULL instead of 1, so the shifting computation is done in a uint64, not in a 32bit int. Co-Authored-By: Claude Opus 4.8 --- docs/changes.rst | 8 + docs/usage/transfer.rst | 2 +- scripts/chunker_bench.py | 321 ++++++++++++++++++ src/borg/archiver/benchmark_cmd.py | 10 +- src/borg/archiver/completion_cmd.py | 2 +- src/borg/chunkers/__init__.py | 4 +- src/borg/chunkers/buzhash64.pyx | 59 +++- src/borg/constants.py | 6 +- src/borg/helpers/parseformat.py | 15 +- src/borg/testsuite/archiver/list_cmd_test.py | 2 +- src/borg/testsuite/chunkers/buzhash64_test.py | 6 +- 11 files changed, 414 insertions(+), 21 deletions(-) create mode 100644 scripts/chunker_bench.py diff --git a/docs/changes.rst b/docs/changes.rst index 566c3d4610..8caa0d0e8b 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -168,6 +168,14 @@ above. New features: +- buzhash64 chunker: add FastCDC-style normalized chunking and enable it by default + (``nc_level=2``). It switches between a stricter and a looser cut mask around the target + chunk size, which greatly tightens the chunk-size distribution (chunk-size variance / + coefficient of variation roughly cut by ~60% in tests) and removes the dedup-hostile + max-size-clamped chunks, at negligible throughput cost and with unchanged deduplication. + ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level`` + (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``). + buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x. - repo-create: split ``--encryption`` into orthogonal options. 
``--encryption`` now selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb`` or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function diff --git a/docs/usage/transfer.rst b/docs/usage/transfer.rst index 22e729debe..86c5716bde 100644 --- a/docs/usage/transfer.rst +++ b/docs/usage/transfer.rst @@ -55,7 +55,7 @@ locations and passphrases first: # The AEAD cipher does not matter (everything must be re-encrypted and # re-authenticated anyway); you could also choose -e chacha20-poly1305 -i blake3. $ borg repo-create -e aes256-ocb -i blake3 - $ export CHUNKER_PARAMS="buzhash64,19,23,21,4095" + $ export CHUNKER_PARAMS="buzhash64,19,23,21,4095,2" # 2. Check what and how much it would transfer: $ borg transfer --from-borg1 --chunker-params=$CHUNKER_PARAMS --dry-run diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py new file mode 100644 index 0000000000..a90b66a04c --- /dev/null +++ b/scripts/chunker_bench.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +""" +buzhash64 chunker evaluation harness. + +Purpose +------- +Establish an *evidence baseline* for the current buzhash64 chunker (and buzhash32 +for reference) so that any future change to buzhash64 can be judged against real +numbers instead of intuition. + +It measures, for a given chunker config and corpus: + + * chunk-size distribution: count, mean, stddev, coefficient of variation (CV), + and how many chunks were clamped at min_size / max_size, + * deduplication ratio: unique-chunk-bytes / total-bytes (lower is better dedup), + * throughput in MB/s, + * shift resilience: re-chunk a mutated copy (bytes inserted/deleted at random + offsets) and report what fraction of chunks (by content) survive. This is the + property content-defined chunking exists for; size-distribution changes can + help or hurt it, so we must watch it. 
+ +Corpora +------- + --path FILE_OR_DIR use real data (a dir is concatenated, file order sorted) + --synthetic random:N N bytes of os.urandom (incompressible, worst case) + --synthetic lcg:N N bytes of a cheap LCG stream (deterministic) + --synthetic textish:N N bytes of low-entropy, repetitive ascii-ish data + +Examples +-------- + python scripts/chunker_bench.py --synthetic lcg:67108864 + python scripts/chunker_bench.py --path /usr/lib --max-bytes 268435456 + python scripts/chunker_bench.py --path ./some.tar --algo buzhash64 buzhash + +This script imports the *compiled* borg chunkers, so build borg first. +It does not modify borg in any way; it is a measurement tool only. +""" + +import argparse +import hashlib +import os +import random +import statistics +import sys +import time +from io import BytesIO + +from borg.chunkers import get_chunker +from borg.constants import CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE + + +def gen_synthetic(spec): + kind, _, rest = spec.partition(":") + if kind == "versioned": + # parsed below from the full spec (it has two numeric fields) + n = 0 + else: + n = int(rest) + if kind == "random": + return os.urandom(n) + if kind == "lcg": + a = bytearray(n) + x = 1 + for i in range(n): + x = (x * 1103515245 + 12345) & 0x7FFFFFFF + a[i] = x & 0xFF + return bytes(a) + if kind == "versioned": + # "versioned:N[:E]" -> corpus = v1 ++ v2, where v2 is v1 with E scattered single-byte + # inserts/deletes (default E=64). Models backing up a slightly-changed large file: the + # dedup ratio shows how much of v2 is re-deduplicated against v1, which is exactly what + # shift-resilient chunk boundaries (and normalized chunking) affect. 
+ parts = spec.split(":") + n = int(parts[1]) + edits = int(parts[2]) if len(parts) > 2 else 64 + v1 = os.urandom(n) + v2 = mutate(v1, edits, random.Random(42)) + corpus = v1 + v2 + del v1, v2 + return corpus + if kind == "textish": + # low-entropy, repetitive: stresses buzhash window cancellation and + # tends to produce many min/max-clamped chunks. + words = [ + b"the ", + b"quick ", + b"brown ", + b"fox ", + b"jumps ", + b"over ", + b"lazy ", + b"dog ", + b"lorem ", + b"ipsum ", + b"dolor ", + b"sit ", + ] + rng = random.Random(1234) + out = bytearray() + while len(out) < n: + out += rng.choice(words) + return bytes(out[:n]) + raise SystemExit(f"unknown synthetic spec: {spec!r}") + + +def load_path(path, max_bytes): + if os.path.isfile(path): + with open(path, "rb") as f: + return f.read(max_bytes if max_bytes else -1) + buf = bytearray() + for root, _, files in os.walk(path): + for name in sorted(files): + fp = os.path.join(root, name) + try: + with open(fp, "rb") as f: + buf += f.read() + except OSError: + continue + if max_bytes and len(buf) >= max_bytes: + return bytes(buf[:max_bytes]) + return bytes(buf) + + +def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal_size=0): + """Chunk data and return (sizes, hashes, chunking_time) without materializing chunk bytes. + + Memory-lean: only a size (int) and a sha256 digest are kept per chunk, so very large + corpora can be processed. 
key=None -> zero key (deterministic).""" + params = [min_exp, max_exp, mask_bits, win] + kw = dict(key=None, sparse=False) + if algo == "buzhash64": + params.append(nc_level) # nc_level is a positional buzhash64 param + kw["normal_size"] = normal_size + chunker = get_chunker(algo, *params, **kw) + sizes = [] + hashes = [] + for c in chunker.chunkify(BytesIO(data)): + if c.data is None: # hole / all-zero alloc chunk + n = c.meta["size"] + sizes.append(n) + hashes.append(hashlib.sha256(b"\0" * n).digest()) + else: + b = c.data + sizes.append(len(b)) + hashes.append(hashlib.sha256(b).digest()) + return sizes, hashes, getattr(chunker, "chunking_time", 0.0) + + +def mutate(data, n_edits, rng): + """Insert and delete a few single bytes at random offsets (boundary shift test).""" + b = bytearray(data) + for _ in range(n_edits): + pos = rng.randrange(len(b)) + if rng.random() < 0.5: + b.insert(pos, rng.randrange(256)) + else: + del b[pos] + return bytes(b) + + +def analyze(algo, data, params, shift_edits, rng, nc_level=0, normal_size=0): + min_exp, max_exp, mask_bits, win = params + min_size, max_size = 1 << min_exp, 1 << max_exp + + t0 = time.monotonic() + sizes, hashes, internal_t = chunk_stats(algo, data, *params, nc_level=nc_level, normal_size=normal_size) + wall = time.monotonic() - t0 + + # drop last chunk for distribution stats (it is a remainder, often < min) + dist_sizes = sizes[:-1] if len(sizes) > 1 else sizes + total = sum(sizes) + + mean = statistics.fmean(dist_sizes) if dist_sizes else 0 + stdev = statistics.pstdev(dist_sizes) if len(dist_sizes) > 1 else 0.0 + cv = (stdev / mean) if mean else 0.0 + min_clamped = sum(1 for s in dist_sizes if s == min_size) + max_clamped = sum(1 for s in dist_sizes if s == max_size) + + # dedup ratio: unique chunk content / total (lower = more dedup) + seen = set() + unique_bytes = 0 + for h, n in zip(hashes, sizes): + if h not in seen: + seen.add(h) + unique_bytes += n + dedup_ratio = unique_bytes / total if total else 0.0 + + 
# shift resilience: re-chunk a mutated copy, fraction of chunks (by content) that survive + shift_survival = None + if shift_edits: + mutated = mutate(data, shift_edits, rng) + _, mhashes, _ = chunk_stats(algo, mutated, *params, nc_level=nc_level, normal_size=normal_size) + del mutated + orig_set = set(hashes) + survived = sum(1 for h in mhashes if h in orig_set) + shift_survival = survived / len(mhashes) if mhashes else 0.0 + + mb = total / (1024 * 1024) + secs = internal_t or wall + label = algo if not nc_level else f"{algo}/nc{nc_level}" + return { + "algo": label, + "count": len(sizes), + "total_mb": mb, + "mean": mean, + "stdev": stdev, + "cv": cv, + "min_clamped": min_clamped, + "max_clamped": max_clamped, + "min_obs": min(dist_sizes) if dist_sizes else 0, + "max_obs": max(dist_sizes) if dist_sizes else 0, + "dedup_ratio": dedup_ratio, + "throughput_mbps": mb / secs if secs else float("inf"), + "shift_survival": shift_survival, + } + + +def fmt(r): + line = ( + f"{r['algo']:>13} " + f"n={r['count']:>6} " + f"mean={r['mean']/1024:8.1f}K " + f"stdev={r['stdev']/1024:8.1f}K " + f"CV={r['cv']:5.3f} " + f"min/max-clamp={r['min_clamped']:>4}/{r['max_clamped']:<4} " + f"dedup={r['dedup_ratio']:6.4f} " + f"{r['throughput_mbps']:7.1f} MB/s" + ) + if r["shift_survival"] is not None: + line += f" shift-survive={r['shift_survival']:6.4f}" + return line + + +def main(): + ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + src = ap.add_mutually_exclusive_group(required=True) + src.add_argument("--path", help="file or directory to use as corpus") + src.add_argument("--synthetic", help="random:N | lcg:N | textish:N | versioned:N[:E]") + ap.add_argument("--max-bytes", type=int, default=0, help="cap corpus size (0 = no cap)") + ap.add_argument( + "--algo", + nargs="+", + default=["buzhash64", "buzhash"], + help="chunker algos to compare (default: buzhash64 buzhash)", + ) + ap.add_argument("--min-exp", type=int, default=CHUNK_MIN_EXP) + 
ap.add_argument("--max-exp", type=int, default=CHUNK_MAX_EXP) + ap.add_argument("--mask-bits", type=int, default=HASH_MASK_BITS) + ap.add_argument("--window", type=int, default=HASH_WINDOW_SIZE) + ap.add_argument( + "--nc-level", + type=int, + default=2, + help="normalized chunking level for buzhash64; runs nc=0 AND this level (0 to disable)", + ) + ap.add_argument( + "--normal-size", + type=int, + default=0, + help="explicit NC transition size in bytes (0 = auto = 2**mask_bits - 2**(mask_bits - nc_level))", + ) + ap.add_argument( + "--shift-edits", type=int, default=8, help="number of random insert/delete edits for shift test (0 to skip)" + ) + ap.add_argument("--repeat", type=int, default=1, help="repeat runs (throughput stability)") + ap.add_argument("--seed", type=int, default=0) + args = ap.parse_args() + + if args.synthetic: + data = gen_synthetic(args.synthetic) + corpus_desc = args.synthetic + else: + data = load_path(args.path, args.max_bytes) + corpus_desc = args.path + if args.max_bytes: + data = data[: args.max_bytes] + + params = (args.min_exp, args.max_exp, args.mask_bits, args.window) + + print(f"corpus: {corpus_desc} size: {len(data)/(1024*1024):.1f} MiB") + print( + f"params: min_exp={params[0]} max_exp={params[1]} mask_bits={params[2]} " + f"window={params[3]} (target ~{(1<<params[2])//1024} KiB)" + ) + print(f"shift test: {args.shift_edits} edits repeats: {args.repeat}") + print("-" * 118) + + # build (algo, nc_level) variants; for buzhash64 also run the requested NC level + variants = [] + for algo in args.algo: + variants.append((algo, 0)) + if algo == "buzhash64" and args.nc_level > 0: + variants.append((algo, args.nc_level)) + + for algo, nc in variants: + best_tput = 0.0 + last = None + for _ in range(args.repeat): + r = analyze( + algo, + data, + params, + args.shift_edits, + random.Random(args.seed), + nc_level=nc, + normal_size=args.normal_size, + ) + best_tput = max(best_tput, r["throughput_mbps"]) + last = r + last["throughput_mbps"] = best_tput # report best (least-noisy) throughput + print(fmt(last)) + + print("-" * 118) + print( + "notes: dedup<1.0 only if corpus has duplicate content; CV lower = tighter " + "size distribution; shift-survive higher = better." 
+ ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index e448bbef0a..099cac8b36 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -199,8 +199,8 @@ def chunkit(ch): ), # note: the buzhash64 chunker creation is rather slow, so we must keep it in setup ( - "buzhash64,19,23,21,4095", - "ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False)", + "buzhash64,19,23,21,4095,2", + "ch = get_chunker('buzhash64', 19, 23, 21, 4095, 2, sparse=False)", "chunkit(ch)", locals(), ), @@ -211,7 +211,7 @@ def chunkit(ch): algo, _, algo_params = spec.partition(",") result["chunkers"].append({"algo": algo, "algo_params": algo_params, "size": size, "time": dt}) else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") + print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s") from ..crypto.low_level import hmac_sha256, blake2b_256 import blake3 @@ -232,7 +232,7 @@ def chunkit(ch): if args.json: result["hashes"].append({"algo": spec, "size": size, "time": dt}) else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") + print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s") from ..crypto.low_level import AES256_CTR_BLAKE2b, AES256_CTR_HMAC_SHA256 from ..crypto.low_level import AES256_OCB, CHACHA20_POLY1305 @@ -272,7 +272,7 @@ def chunkit(ch): if args.json: result["encryption"].append({"algo": spec, "size": size, "time": dt}) else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") + print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s") if not args.json: print("Compression ====================================================") diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py index 4b973fafce..1bce0cb15d 100644 --- a/src/borg/archiver/completion_cmd.py +++ b/src/borg/archiver/completion_cmd.py @@ -708,7 +708,7 @@ def do_completion(self, args): comp_spec_choices_str = " 
".join(comp_spec_choices) # Chunker params choices (static list) - chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095"] + chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"] chunker_params_choices_str = " ".join(chunker_params_choices) # Relative time marker choices (static list) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index dd3376985d..7282b5e8eb 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -17,7 +17,9 @@ def get_chunker(algo, *params, **kw): if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": - return ChunkerBuzHash64(bh64_key, *params, sparse=sparse) + # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level); + # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto). + return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index a3c7bf1101..1a1ce8fc9c 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -109,6 +109,9 @@ cdef class ChunkerBuzHash64: It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks. 
""" cdef uint64_t chunk_mask + cdef uint64_t mask_s, mask_l # normalized chunking: strict / loose masks + cdef size_t normal_size # chunk length at which we switch mask_s -> mask_l + cdef int nc_level # normalized chunking level (0 = disabled) cdef uint64_t* table cdef uint8_t* data cdef object _fd # Python object for file descriptor @@ -121,7 +124,7 @@ cdef class ChunkerBuzHash64: cdef size_t reader_block_size cdef bint sparse - def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): + def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, int nc_level=0, size_t normal_size=0, bint sparse=False): self.table = NULL self.data = NULL min_size = 1 << chunk_min_exp @@ -131,8 +134,29 @@ cdef class ChunkerBuzHash64: assert hash_window_size + min_size + 1 <= max_size, "too small max_size" self.window_size = hash_window_size - self.chunk_mask = (1 << hash_mask_bits) - 1 + self.chunk_mask = (1ULL << hash_mask_bits) - 1 self.min_size = min_size + # Normalized chunking (FastCDC-style): use a stricter mask (lower cut probability) until + # the chunk reaches its expected/normal size, then a looser mask (higher cut probability). + # This concentrates chunk sizes around the target and reduces chunk-size variance. + # nc_level == 0 disables it, keeping behavior byte-identical to the single-mask chunker. + assert nc_level >= 0 + assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits" + assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits" + self.nc_level = nc_level + if nc_level: + self.mask_s = (1ULL << (hash_mask_bits + nc_level)) - 1 + self.mask_l = (1ULL << (hash_mask_bits - nc_level)) - 1 + # normal_size is the chunk length at which we switch from the strict to the loose + # mask; it dominates the mean chunk size. 
The default is the nominal target size + # (1ULL << hash_mask_bits) minus the expected loose-phase tail (1ULL << (hash_mask_bits - nc_level)), + # which lands the mean close to the target instead of overshooting it. Pass an + # explicit normal_size to tune it further. + self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level))) + else: + self.mask_s = self.chunk_mask + self.mask_l = self.chunk_mask + self.normal_size = 0 self.table = buzhash64_init_table(key) self.buf_size = max_size self.data = <uint8_t *>malloc(self.buf_size) @@ -196,10 +220,14 @@ cdef class ChunkerBuzHash64: cdef object process(self) except *: """Process the chunker's buffer and return the next chunk.""" - cdef uint64_t sum, chunk_mask = self.chunk_mask + cdef uint64_t sum, mask + cdef uint64_t mask_s = self.mask_s, mask_l = self.mask_l + cdef int nc_level = self.nc_level cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size + cdef size_t normal_size = self.normal_size, normal_pos cdef uint8_t* p cdef uint8_t* stop_at + cdef uint8_t* nc_stop cdef size_t did_bytes if self.done: @@ -232,11 +260,32 @@ cdef class ChunkerBuzHash64: self.remaining -= min_size sum = _buzhash64(self.data + self.position, window_size, self.table) - while self.remaining > window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size): + # Normalized chunking: pick the mask based on how far we are into the current chunk. + # While below normal_size use the strict mask (lower cut probability), afterward the + # loose mask (higher cut probability). The mask is re-evaluated at the top of every + # iteration, so the transition is honored exactly at normal_pos. When nc is disabled, + # mask_s == mask_l == chunk_mask and the normal_pos cap is not applied, so this reduces + # to the original single-mask behavior. 
+ mask = mask_s + normal_pos = 0 + while True: + if nc_level: + normal_pos = self.last + normal_size + mask = mask_s if self.position < normal_pos else mask_l + + if not (self.remaining > window_size and (sum & mask) and not (self.eof and self.remaining <= window_size)): + break + p = self.data + self.position stop_at = p + self.remaining - window_size - while p < stop_at and (sum & chunk_mask): + if nc_level and self.position < normal_pos: + # do not scan past the strict->loose transition; re-evaluate the mask there + nc_stop = self.data + normal_pos + if nc_stop < stop_at: + stop_at = nc_stop + + while p < stop_at and (sum & mask): sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table) p += 1 diff --git a/src/borg/constants.py b/src/borg/constants.py index 5c88b6b89e..319393ec30 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -111,9 +111,13 @@ HASH_WINDOW_SIZE = 0xFFF # 4095 B HASH_MASK_BITS = 21 # results in ~2 MiB chunks statistically +# buzhash64-only: normalized chunking level (0 disables it). buzhash (32bit) does not support this +# and must stay bit-compatible to borg 1.x, so it has no nc_level param. 
+NC_LEVEL = 2 # FastCDC-style normalized chunking: tightens chunk-size distribution (much lower variance) + # defaults, use --chunker-params to override CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) -CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) +CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL) # chunker params for the items metadata stream, finer granularity ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index b8d9f89a9a..72183885fd 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -304,8 +304,11 @@ def ChunkerParams(s): return algo, block_size, header_size if algo == "default" and count == 1: # default return CHUNKER_PARAMS - if algo == CH_BUZHASH64 and count == 5: # buzhash64, chunk_min, chunk_max, chunk_mask, window_size - chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:]) + if algo == CH_BUZHASH64 and count == 6: + # buzhash64, chunk_min, chunk_max, chunk_mask, window_size, nc_level + # use nc_level 0 to disable normalized chunking. + chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:5]) + nc_level = int(params[5]) if not (chunk_min <= chunk_mask <= chunk_max): raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max") if chunk_min < 6: @@ -313,8 +316,14 @@ def ChunkerParams(s): raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)") if chunk_max > 23: raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)") + # normalized chunking switches the mask at the target size; it needs room below and above + # the base mask bits (chunk_mask). nc_level 0 disables it. 
+ if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48): + raise ArgumentTypeError( + "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48" + ) # note that for buzhash64, there is no problem with even window_size. - return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size + return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash): if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :]) diff --git a/src/borg/testsuite/archiver/list_cmd_test.py b/src/borg/testsuite/archiver/list_cmd_test.py index f9bb56e58a..d5ce605890 100644 --- a/src/borg/testsuite/archiver/list_cmd_test.py +++ b/src/borg/testsuite/archiver/list_cmd_test.py @@ -250,7 +250,7 @@ def test_fingerprint(archivers, request): assert fingerprints1["input/file2"] != fingerprints4["input/file2"] # Also try with buzhash64 - cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input") + cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095,2", "test5", "input") output = cmd(archiver, "list", "test5", "--format={fingerprint} {path}{NL}") fingerprints5 = {} for line in output.splitlines(): diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index 0bbeb4d3d5..9b19448587 100644 --- a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -110,15 +110,15 @@ def test_fuzz_bh64(worker): def rnd_key(): return os.urandom(32) - # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size) - algo, min_exp, max_exp, mask_bits, win_size = CHUNKER64_PARAMS + # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size, 
nc_level) + algo, min_exp, max_exp, mask_bits, win_size, nc_level = CHUNKER64_PARAMS assert algo == CH_BUZHASH64 # default chunker must be buzhash64 here keys = [b"\0" * 32] + [rnd_key() for _ in range(10)] sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)] for key in keys: - chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size) + chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size, nc_level) for size in sizes: # Random data data = os.urandom(size) From afa8189938e8ad58780900322f4a12aacb27aa5d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 27 Jun 2026 22:47:52 +0200 Subject: [PATCH 2/2] fastcdc: add FastCDC chunker with a keyed Gear hash Add a new "fastcdc" content-defined chunker selectable via --chunker-params. It uses the FastCDC Gear rolling hash (fp = (fp << 1) + Gear[byte]), which is window-less and cheaper per byte than buzhash's cyclic-polynomial update, so it chunks noticeably faster (see "borg benchmark cpu" output), while producing the same chunk-size distribution and deduplication. The Gear table is keyed: it is derived from the repo id key via CSPRNG (own "fastcdc" domain), exactly like the buzhash64 table, so chunk cut points stay unpredictable without the key (anti-fingerprinting). It implements the same FastCDC techniques as buzhash64 (sub-minimum skipping, normalized chunking with a required nc_level, min/max clamping); the mask uses the high bits of the hash (Gear accumulates entropy there). chunker-params: "fastcdc,chunk_min,chunk_max,chunk_mask,nc_level" - there is no window field, because Gear is window-less. e.g. fastcdc,19,23,21,2 Also: borg benchmark cpu now measures the fastcdc chunker; tests in borg.testsuite.chunkers (golden vector, size distribution, keyed gear table, param parsing, slow fuzz); docs and changelog. 
Benchmarks (scripts/chunker_bench.py, buzhash64 vs fastcdc, both nc_level=2, incompressible data unless noted): 5 GiB, 2 MiB target (default params): buzhash64: CV 0.294, 1011 MB/s fastcdc: CV 0.295, 1313 MB/s (+30%) 64 MiB, 64 KiB target: buzhash64: CV 0.374, shift-resilience 0.9928, 963 MB/s fastcdc: CV 0.359, shift-resilience 0.9929, 1331 MB/s (+38%) Re-backup of a 2.5 GiB file after scattered single-byte edits (dedup ratio, 0.5 = v2 fully deduplicated, lower is better): 64 edits: buzhash64 0.5237, fastcdc 0.5236 320 edits: buzhash64 0.6133, fastcdc 0.6161 borg benchmark cpu, 1 GB: fastcdc 3.80s, buzhash 4.36s, buzhash64 8.13s, fixed 0.56s. Chunk-size distribution, deduplication and shift-resilience match buzhash64 within noise; fastcdc is consistently faster. Also: fix bug when computing the mask, one needs to use 1ULL instead of 1, so the shifting computation is done in a uint64, not in a 32bit int. Co-Authored-By: Claude Opus 4.8 --- .gitignore | 1 + docs/changes.rst | 6 + docs/global.rst.inc | 1 + docs/internals/data-structures.rst | 18 ++ scripts/chunker_bench.py | 9 +- setup.py | 3 + src/borg/archiver/benchmark_cmd.py | 2 + src/borg/archiver/completion_cmd.py | 8 +- src/borg/chunkers/__init__.py | 16 +- src/borg/chunkers/fastcdc.pyx | 285 ++++++++++++++++++++ src/borg/constants.py | 3 + src/borg/helpers/parseformat.py | 18 ++ src/borg/testsuite/chunkers/fastcdc_test.py | 161 +++++++++++ 13 files changed, 523 insertions(+), 8 deletions(-) create mode 100644 src/borg/chunkers/fastcdc.pyx create mode 100644 src/borg/testsuite/chunkers/fastcdc_test.py diff --git a/.gitignore b/.gitignore index f2f7461228..a2254bafd7 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ src/borg/crypto/low_level.c src/borg/item.c src/borg/chunkers/buzhash.c src/borg/chunkers/buzhash64.c +src/borg/chunkers/fastcdc.c src/borg/chunkers/reader.c src/borg/checksums.c src/borg/platform/darwin.c diff --git a/docs/changes.rst b/docs/changes.rst index 8caa0d0e8b..9aac15a9f5 100644 
--- a/docs/changes.rst +++ b/docs/changes.rst @@ -176,6 +176,12 @@ New features: ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level`` (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``). buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x. +- new ``fastcdc`` chunker: a FastCDC content-defined chunker using a window-less, keyed Gear + rolling hash (the gear table is derived from the repo's id key, like buzhash64, so cut points + stay unpredictable without the key). It supports the same normalized chunking as buzhash64 and + produces the same chunk-size distribution and deduplication, but chunks roughly 1.3-1.5x faster. + Select it via ``--chunker-params fastcdc,chunk_min,chunk_max,chunk_mask,nc_level`` (no window + field; e.g. ``fastcdc,19,23,21,2``). ``borg benchmark cpu`` now reports its throughput too. - repo-create: split ``--encryption`` into orthogonal options. ``--encryption`` now selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb`` or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function diff --git a/docs/global.rst.inc b/docs/global.rst.inc index a3c8df1cc8..196391a265 100644 --- a/docs/global.rst.inc +++ b/docs/global.rst.inc @@ -19,6 +19,7 @@ .. _OpenSSL: https://www.openssl.org/ .. _`Python 3`: https://www.python.org/ .. _Buzhash: https://en.wikipedia.org/wiki/Buzhash +.. _FastCDC: https://www.usenix.org/conference/atc16/technical-sessions/presentation/xia .. _msgpack: https://msgpack.org/ .. _`msgpack-python`: https://pypi.org/project/msgpack-python/ .. _llfuse: https://pypi.org/project/llfuse/ diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index 0a1e86edb0..2f63218758 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -403,6 +403,8 @@ Borg has these chunkers: - "buzhash": variable, content-defined blocksize, uses a rolling hash computed by the Buzhash_ algorithm. 
- "buzhash64": similar to "buzhash", but improved 64bit implementation +- "fastcdc": variable, content-defined blocksize, uses the window-less, keyed + Gear rolling hash (FastCDC_); faster than buzhash, same deduplication. For some more general usage hints see also ``--chunker-params``. @@ -483,6 +485,22 @@ The buzhash table is cryptographically derived from secret key material. These changes should improve resistance against attacks and also solve some of the issues of the original (32bit / XORed table) implementation. +"fastcdc" chunker ++++++++++++++++++ + +FastCDC_ content-defined chunker using the Gear rolling hash. Unlike buzhash it +is window-less (each byte's influence simply decays out of the hash), so its +update is cheaper and it chunks noticeably faster, while producing the same +deduplication and (with normalized chunking) the same chunk-size distribution. + +Like "buzhash64", the Gear table is cryptographically derived from secret key +material, so chunk cut points are unpredictable without the key. + +``borg create --chunker-params fastcdc,CHUNK_MIN_EXP,CHUNK_MAX_EXP,HASH_MASK_BITS,NC_LEVEL`` + +There is no window size (Gear is window-less). NC_LEVEL is the normalized +chunking level (0 disables it); 2 is a good default. E.g.: ``fastcdc,19,23,21,2``. + .. 
_cache: The cache diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py index a90b66a04c..1d439d0099 100644 --- a/scripts/chunker_bench.py +++ b/scripts/chunker_bench.py @@ -129,7 +129,10 @@ def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal params = [min_exp, max_exp, mask_bits, win] kw = dict(key=None, sparse=False) if algo == "buzhash64": - params.append(nc_level) # nc_level is a positional buzhash64 param + params.append(nc_level) # nc_level is a positional param + kw["normal_size"] = normal_size + elif algo == "fastcdc": + params = [min_exp, max_exp, mask_bits, nc_level] # fastcdc is window-less kw["normal_size"] = normal_size chunker = get_chunker(algo, *params, **kw) sizes = [] @@ -285,11 +288,11 @@ def main(): print(f"shift test: {args.shift_edits} edits repeats: {args.repeat}") print("-" * 118) - # build (algo, nc_level) variants; for buzhash64 also run the requested NC level + # build (algo, nc_level) variants; for buzhash64/fastcdc also run the requested NC level variants = [] for algo in args.algo: variants.append((algo, 0)) - if algo == "buzhash64" and args.nc_level > 0: + if algo in ("buzhash64", "fastcdc") and args.nc_level > 0: variants.append((algo, args.nc_level)) for algo, nc in variants: diff --git a/setup.py b/setup.py index 19f1c92789..1ad76e8048 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ crypto_legacy_ll_source = "src/borg/legacy/crypto/low_level.pyx" buzhash_source = "src/borg/chunkers/buzhash.pyx" buzhash64_source = "src/borg/chunkers/buzhash64.pyx" +fastcdc_source = "src/borg/chunkers/fastcdc.pyx" reader_source = "src/borg/chunkers/reader.pyx" hashindex_source = "src/borg/hashindex.pyx" item_source = "src/borg/item.pyx" @@ -73,6 +74,7 @@ crypto_legacy_ll_source, buzhash_source, buzhash64_source, + fastcdc_source, reader_source, hashindex_source, item_source, @@ -189,6 +191,7 @@ def lib_ext_kwargs(pc, prefix_env_var, lib_name, lib_pkg_name, pc_version, lib_s Extension("borg.item", 
[item_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash", [buzhash_source], extra_compile_args=cflags), Extension("borg.chunkers.buzhash64", [buzhash64_source], extra_compile_args=cflags), + Extension("borg.chunkers.fastcdc", [fastcdc_source], extra_compile_args=cflags), Extension("borg.chunkers.reader", [reader_source], extra_compile_args=cflags), ] diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index 099cac8b36..71615e6456 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -204,6 +204,8 @@ def chunkit(ch): "chunkit(ch)", locals(), ), + # fastcdc (window-less keyed gear hash); gear table creation is slow, keep it in setup + ("fastcdc,19,23,21,2", "ch = get_chunker('fastcdc', 19, 23, 21, 2, sparse=False)", "chunkit(ch)", locals()), ("fixed,1048576", "ch = get_chunker('fixed', 1048576, sparse=False)", "chunkit(ch)", locals()), ]: dt = timeit(func, setup, number=number_default, globals=vars) diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py index 1bce0cb15d..6b73444913 100644 --- a/src/borg/archiver/completion_cmd.py +++ b/src/borg/archiver/completion_cmd.py @@ -708,7 +708,13 @@ def do_completion(self, args): comp_spec_choices_str = " ".join(comp_spec_choices) # Chunker params choices (static list) - chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"] + chunker_params_choices = [ + "default", + "fixed,4194304", + "buzhash,19,23,21,4095", + "buzhash64,19,23,21,4095,2", + "fastcdc,19,23,21,2", + ] chunker_params_choices_str = " ".join(chunker_params_choices) # Relative time marker choices (static list) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index 7282b5e8eb..461cc90d16 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -1,5 +1,6 @@ from .buzhash import Chunker from .buzhash64 import ChunkerBuzHash64 +from .fastcdc 
import ChunkerFastCDC from .failing import ChunkerFailing from .fixed import ChunkerFixed from .reader import * # noqa @@ -10,16 +11,23 @@ def get_chunker(algo, *params, **kw): sparse = kw.get("sparse", False) # key.chunk_seed only has 32 bits seed = key.chunk_seed if key is not None else 0 - # for buzhash64, we want a much longer key, so we derive it from the id key - bh64_key = ( - key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32 - ) if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": + # for buzhash64, we want a much longer key, so we derive it from the id key. # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level); # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto). + bh64_key = ( + key.derive_key(salt=b"", domain=b"buzhash64", size=32, from_id_key=True) if key is not None else b"\0" * 32 + ) return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse) + if algo == "fastcdc": + # keyed gear table, derived from the id key (own domain). params is + # (chunk_min_exp, chunk_max_exp, hash_mask_bits, nc_level) - no window (Gear is window-less). 
+ fc_key = ( + key.derive_key(salt=b"", domain=b"fastcdc", size=32, from_id_key=True) if key is not None else b"\0" * 32 + ) + return ChunkerFastCDC(fc_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/fastcdc.pyx b/src/borg/chunkers/fastcdc.pyx new file mode 100644 index 0000000000..2248ff5406 --- /dev/null +++ b/src/borg/chunkers/fastcdc.pyx @@ -0,0 +1,285 @@ +# cython: language_level=3 + +import cython +import time + +from cpython.bytes cimport PyBytes_AsString +from libc.stdint cimport uint8_t, uint64_t +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memmove, memset + +from ..crypto.low_level import CSPRNG + +from ..constants import CH_DATA, CH_ALLOC, CH_HOLE, zeros +from .reader import FileReader, Chunk + +# FastCDC content-defined chunker (Xia et al., USENIX ATC 2016). +# +# Differences vs. the buzhash64 chunker in this package: +# * It uses the Gear rolling hash: fp = (fp << 1) + Gear[byte]. This is a single shift, +# add and table lookup per byte (no window, no "remove" term), so it is cheaper than +# buzhash's cyclic-polynomial update. +# * The Gear table is keyed from a 256-bit key via the same CSPRNG used by buzhash64, so +# cut points are unpredictable without the key (anti-fingerprinting), just like buzhash64. +# * Because the Gear hash accumulates information in its HIGH bits (the low bits only depend +# on the most recent bytes), the cut-decision mask uses the high bits of the hash. +# +# It implements the same FastCDC techniques the buzhash64 chunker uses: sub-minimum cut-point +# skipping, normalized chunking (strict/loose mask around a "normal" size), and min/max clamping. 
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef uint64_t* fastcdc_init_gear(bytes key) except NULL:
+    """Generate a keyed 256-entry, 64-bit Gear table deterministically from a 256-bit key."""
+    rng = CSPRNG(key)
+    cdef bytes rnd = rng.random_bytes(2048)  # 256 * sizeof(uint64_t)
+    cdef const uint8_t* rp = <const uint8_t*> PyBytes_AsString(rnd)  # explicit cast: char* -> const uint8_t*
+    cdef uint64_t* gear = <uint64_t*> malloc(2048)  # explicit cast: malloc returns void*
+    if gear == NULL:
+        raise MemoryError("Failed to allocate fastcdc gear table")
+    cdef int i, j
+    cdef uint64_t v
+    for i in range(256):
+        v = 0
+        for j in range(8):
+            v |= (<uint64_t> rp[i * 8 + j]) << (8 * j)  # widen to 64 bits first: the shift goes up to 56
+        gear[i] = v
+    return gear
+
+
+cdef inline uint64_t _high_mask(int bits):
+    """A mask with one-bits in the most significant positions (Gear's strong bits)."""
+    if bits <= 0:
+        return 0
+    if bits >= 64:
+        return 0xFFFFFFFFFFFFFFFF
+    return ((<uint64_t> 1 << bits) - 1) << (64 - bits)  # do the shifts in uint64_t, not in a narrower C int
+
+
+cdef class ChunkerFastCDC:
+    """
+    FastCDC content-defined chunker, variable chunk sizes, keyed Gear hash.
+
+    Unlike the buzhash chunkers, Gear is window-less, so there is no hash_window_size parameter.
+    """
+    cdef uint64_t chunk_mask
+    cdef uint64_t mask_s, mask_l  # normalized chunking: strict / loose masks
+    cdef size_t normal_size  # chunk length at which we switch mask_s -> mask_l
+    cdef int nc_level  # normalized chunking level (0 = disabled)
+    cdef uint64_t* gear
+    cdef uint8_t* data
+    cdef object _fd
+    cdef int fh
+    cdef int done, eof
+    cdef size_t min_size, buf_size, remaining, position, last
+    cdef long long bytes_read, bytes_yielded
+    cdef readonly float chunking_time
+    cdef object file_reader
+    cdef size_t reader_block_size
+    cdef bint sparse
+
+    def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int nc_level=0, size_t normal_size=0, bint sparse=False):
+        self.gear = NULL
+        self.data = NULL
+        min_size = 1 << chunk_min_exp
+        max_size = 1 << chunk_max_exp
+        assert max_size <= len(zeros)
+        assert min_size + 1 <= max_size, "too small max_size"
+
+        self.chunk_mask = _high_mask(hash_mask_bits)
+        self.min_size = min_size
+        # Normalized chunking, identical structure to the buzhash64 chunker (see there), but with
+        # the mask one-bits placed in the high bits of the Gear hash.
+        assert nc_level >= 0
+        assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits"
+        assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits"
+        self.nc_level = nc_level
+        if nc_level:
+            self.mask_s = _high_mask(hash_mask_bits + nc_level)
+            self.mask_l = _high_mask(hash_mask_bits - nc_level)
+            self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level)))
+        else:
+            self.mask_s = self.chunk_mask
+            self.mask_l = self.chunk_mask
+            self.normal_size = 0
+        self.gear = fastcdc_init_gear(key)
+        self.buf_size = max_size
+        self.data = <uint8_t*> malloc(self.buf_size)  # explicit cast: malloc returns void*
+        if self.data == NULL:
+            raise MemoryError("Failed to allocate chunker buffer")
+        self.fh = -1
+        self.done = 0
+        self.eof = 0
+        self.remaining = 0
+        self.position = 0
+        self.last = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self._fd = None
+        self.chunking_time = 0.0
+        self.reader_block_size = 1024 * 1024
+        self.sparse = sparse
+
+    def __dealloc__(self):
+        if self.gear != NULL:
+            free(self.gear)
+            self.gear = NULL
+        if self.data != NULL:
+            free(self.data)
+            self.data = NULL
+
+    cdef int fill(self) except 0:
+        """Fill the chunker's buffer with more data."""
+        cdef ssize_t n
+        cdef object chunk
+
+        memmove(self.data, self.data + self.last, self.position + self.remaining - self.last)
+        self.position -= self.last
+        self.last = 0
+        n = self.buf_size - self.position - self.remaining
+
+        if self.eof or n == 0:
+            return 1
+
+        chunk = self.file_reader.read(n)
+        n = chunk.meta["size"]
+
+        if n > 0:
+            if chunk.meta["allocation"] == CH_DATA:
+                memcpy(self.data + self.position + self.remaining, PyBytes_AsString(chunk.data), n)
+            else:
+                memset(self.data + self.position + self.remaining, 0, n)
+            self.remaining += n
+            self.bytes_read += n
+        else:
+            self.eof = 1
+        return 1
+
+    cdef object process(self) except *:
+        """Process the chunker's buffer and return the next chunk."""
+        cdef uint64_t fp = 0, mask, mask_s = self.mask_s, mask_l = self.mask_l
+        cdef int nc_level = self.nc_level
+        cdef size_t n, old_last, min_size = self.min_size
+        cdef size_t normal_size = self.normal_size, normal_pos, chunk_len, did
+        cdef uint8_t* p
+        cdef uint8_t* stop
+        cdef uint8_t* cut
+        cdef uint64_t* gear = self.gear
+
+        if self.done:
+            if self.bytes_read == self.bytes_yielded:
+                raise StopIteration
+            else:
+                raise Exception("chunkifier byte count mismatch")
+
+        # ensure at least min_size + 1 bytes are buffered, or we are at eof
+        while self.remaining < min_size + 1 and not self.eof:
+            if not self.fill():
+                return None
+
+        # at eof with only a remainder (< min_size + 1): emit it as the final chunk
+        if self.eof and self.remaining < min_size + 1:
+            self.done = 1
+            if self.remaining:
+                old_last = self.last
+                self.position += self.remaining
+                self.last = self.position
+                n = self.last - old_last
+                self.remaining = 0
+                self.bytes_yielded += n
+                return memoryview((self.data + old_last)[:n])
+            else:
+                if self.bytes_read == self.bytes_yielded:
+                    raise StopIteration
+                else:
+                    raise Exception("chunkifier byte count mismatch")
+
+        # skip the sub-minimum region (no cut allowed below min_size), then gear-scan
+        self.position += min_size
+        self.remaining -= min_size
+        fp = 0
+
+        while True:
+            chunk_len = self.position - self.last
+            mask = mask_s if (nc_level and chunk_len < normal_size) else mask_l
+
+            if self.remaining == 0:
+                if self.eof:
+                    break  # cut at end of data
+                if not self.fill():
+                    return None
+                if self.remaining == 0:
+                    break  # buffer full -> chunk reached max_size -> forced cut
+                continue
+
+            p = self.data + self.position
+            stop = p + self.remaining
+            if nc_level and chunk_len < normal_size:
+                # do not scan past the strict->loose transition; re-evaluate the mask there
+                normal_pos = self.last + normal_size
+                if (self.data + normal_pos) < stop:
+                    stop = self.data + normal_pos
+
+            cut = NULL
+            while p < stop:
+                fp = (fp << 1) + gear[p[0]]
+                if (fp & mask) == 0:
+                    cut = p
+                    break
+                p += 1
+
+            if cut != NULL:
+                p = cut + 1  # cut right after the byte that triggered the boundary
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+                break
+            else:
+                did = p - (self.data + self.position)
+                self.position += did
+                self.remaining -= did
+
+        old_last = self.last
+        self.last = self.position
+        n = self.last - old_last
+        self.bytes_yielded += n
+        return memoryview((self.data + old_last)[:n])
+
+    def chunkify(self, fd, fh=-1, fmap=None):
+        self._fd = fd
+        self.fh = fh
+        self.file_reader = FileReader(fd=fd, fh=fh, read_size=self.reader_block_size, sparse=self.sparse, fmap=fmap)
+        self.done = 0
+        self.remaining = 0
+        self.bytes_read = 0
+        self.bytes_yielded = 0
+        self.position = 0
+        self.last = 0
+        self.eof = 0
+        return self
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        started_chunking = time.monotonic()
+        data = self.process()
+        got = len(data)
+        if zeros.startswith(data):
+            data = None
+            allocation = CH_ALLOC
+        else:
+            allocation = CH_DATA
+        self.chunking_time += time.monotonic() - started_chunking
+        return Chunk(data, size=got, allocation=allocation)
+
+
+def fastcdc_get_gear_table(bytes key):
+    """Get the keyed gear table generated from key (for tests / inspection)."""
+    cdef uint64_t* gear = fastcdc_init_gear(key)
+    cdef int i
+    try:
+        return [gear[i] for i in range(256)]
+    finally:
+        free(gear)
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 319393ec30..17187631d2 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -102,6 +102,7 @@
 # chunker algorithms
 CH_BUZHASH = "buzhash"
 CH_BUZHASH64 = "buzhash64"
+CH_FASTCDC = "fastcdc"
 CH_FIXED = "fixed"
 CH_FAIL = "fail"
 
@@ -118,6 +119,8 @@
 # defaults, use --chunker-params to override
 CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE)
 CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL)
+# fastcdc uses a window-less Gear hash, so it has no window_size parameter.
+FASTCDC_PARAMS = (CH_FASTCDC, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, NC_LEVEL)
 
 # chunker params for the items metadata stream, finer granularity
 ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE)
diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py
index 72183885fd..b5b2928d93 100644
--- a/src/borg/helpers/parseformat.py
+++ b/src/borg/helpers/parseformat.py
@@ -324,6 +324,24 @@ def ChunkerParams(s):
             )
         # note that for buzhash64, there is no problem with even window_size.
         return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level
+    if algo == CH_FASTCDC and count == 5:
+        # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level
+        # fastcdc uses a window-less Gear hash, so there is no window_size field.
+        # nc_level is required; use nc_level 0 to disable normalized chunking.
+        chunk_min, chunk_max, chunk_mask = (int(p) for p in params[1:4])
+        nc_level = int(params[4])
+        if not (chunk_min <= chunk_mask <= chunk_max):
+            raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max")
+        if chunk_min < 6:
+            # see comment in 'fixed' algo check
+            raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)")
+        if chunk_max > 23:
+            raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)")
+        if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48):
+            raise ArgumentTypeError(
+                "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48"
+            )
+        return CH_FASTCDC, chunk_min, chunk_max, chunk_mask, nc_level
     # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash):
     if algo == CH_BUZHASH and count == 5 or count == 4:  # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size
         chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :])
diff --git a/src/borg/testsuite/chunkers/fastcdc_test.py b/src/borg/testsuite/chunkers/fastcdc_test.py
new file mode 100644
index 0000000000..cf06193a6e
--- /dev/null
+++ b/src/borg/testsuite/chunkers/fastcdc_test.py
@@ -0,0 +1,161 @@
+from hashlib import sha256
+from io import BytesIO
+import os
+import random
+
+import pytest
+
+from . import cf, cf_expand
+from ...chunkers import ChunkerFastCDC, get_chunker
+from ...chunkers.fastcdc import fastcdc_get_gear_table
+from ...constants import *  # NOQA
+from ...helpers import hex_to_bin
+
+
+# from os.urandom(32)
+key0 = hex_to_bin("ad9f89095817f0566337dc9ee292fcd59b70f054a8200151f1df5f21704824da")
+key1 = hex_to_bin("f1088c7e9e6ae83557ad1558ff36c44a369ea719d1081c29684f52ffccb72cb8")
+
+
+def H(data):
+    return sha256(data).digest()
+
+
+def test_chunkpoints_fastcdc_unchanged():
+    def twist(size):
+        x = 1
+        a = bytearray(size)
+        for i in range(size):
+            x = (x * 1103515245 + 12345) & 0x7FFFFFFF
+            a[i] = x & 0xFF
+        return a
+
+    data = twist(100000)
+
+    runs = []
+    for nc_level in (0, 2, 3):
+        for minexp in (4, 6, 7, 11, 12):
+            for maxexp in (15, 17):
+                if minexp >= maxexp:
+                    continue
+                for maskbits in (4, 7, 10, 12):
+                    if maskbits - nc_level < 1:  # nc_level needs room below the base mask bits
+                        continue
+                    for key in (key0, key1):
+                        fh = BytesIO(data)
+                        chunker = ChunkerFastCDC(key, minexp, maxexp, maskbits, nc_level)
+                        chunks = [H(c) for c in cf(chunker.chunkify(fh, -1))]
+                        runs.append(H(b"".join(chunks)))
+
+    # The "correct" hash below matches the existing chunker behavior.
+    # Future chunker optimizations must not change this, or existing repos will bloat.
+    overall_hash = H(b"".join(runs))
+    print(overall_hash.hex())
+    assert overall_hash == hex_to_bin("50d39b6f30214d78f665ff97a4800142cddcb6a7c5995e5d162f9c6dceb20cfe")
+
+
+def test_fastcdc_chunksize_distribution():
+    data = os.urandom(1048576)
+    min_exp, max_exp, mask, nc_level = 10, 16, 14, 2  # chunk size target 16 KiB, clip at 1 KiB and 64 KiB
+    chunker = ChunkerFastCDC(key0, min_exp, max_exp, mask, nc_level)
+    f = BytesIO(data)
+    chunks = cf(chunker.chunkify(f))
+    del chunks[-1]  # get rid of the last chunk, it can be smaller than 2**min_exp
+    chunk_sizes = [len(chunk) for chunk in chunks]
+    chunks_count = len(chunks)
+    min_chunksize_observed = min(chunk_sizes)
+    max_chunksize_observed = max(chunk_sizes)
+    min_count = sum(int(size == 2**min_exp) for size in chunk_sizes)
+    max_count = sum(int(size == 2**max_exp) for size in chunk_sizes)
+    print(
+        f"count: {chunks_count} min: {min_chunksize_observed} max: {max_chunksize_observed} "
+        f"min count: {min_count} max count: {max_count}"
+    )
+    # usually there will be about 64 chunks
+    assert 32 < chunks_count < 128
+    # chunks always must be between min and max (clipping must work):
+    assert min_chunksize_observed >= 2**min_exp
+    assert max_chunksize_observed <= 2**max_exp
+    # most chunks should be cut due to the gear hash triggering, not due to clipping at min/max size:
+    assert min_count < 10
+    assert max_count < 10
+
+
+def test_fastcdc_gear_table():
+    # Test that the function returns a list of 256 integers
+    table0 = fastcdc_get_gear_table(key0)
+    assert len(table0) == 256
+    for value in table0:
+        assert isinstance(value, int)
+        assert 0 <= value < 2**64
+
+    # deterministic (same key produces same table)
+    assert table0 == fastcdc_get_gear_table(key0)
+
+    # different keys produce different tables
+    table1 = fastcdc_get_gear_table(key1)
+    assert table0 != table1
+
+
+def test_fastcdc_get_chunker():
+    # without a key, get_chunker uses an all-zero key; chunking must still work and be deterministic
+    data = os.urandom(2 * 1024 * 1024)
+    a = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    b = cf_expand(get_chunker(*FASTCDC_PARAMS, key=None).chunkify(BytesIO(data)))
+    assert a == b
+    assert b"".join(a) == data
+
+
+def test_fastcdc_params_parsing():
+    from argparse import ArgumentTypeError
+
+    from ...helpers import ChunkerParams
+
+    # fastcdc, chunk_min, chunk_max, chunk_mask, nc_level (no window field)
+    assert ChunkerParams("fastcdc,19,23,21,2") == (CH_FASTCDC, 19, 23, 21, 2)
+    assert ChunkerParams("fastcdc,10,23,16,0") == (CH_FASTCDC, 10, 23, 16, 0)
+    # a 6-field (buzhash64-style, with window) fastcdc must be rejected
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,4095,2")
+    # nc_level out of range (chunk_mask - nc_level < 1)
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,21,21")
+    # chunk_min <= chunk_mask <= chunk_max violated
+    with pytest.raises(ArgumentTypeError):
+        ChunkerParams("fastcdc,19,23,24,2")
+
+
+@pytest.mark.skipif("BORG_TESTS_SLOW" not in os.environ, reason="slow tests not enabled, use BORG_TESTS_SLOW=1")
+@pytest.mark.parametrize("worker", range(os.cpu_count() or 1))
+def test_fuzz_fastcdc(worker):
+    # Fuzz fastcdc with random and uniform data of misc. sizes and misc keys.
+    def rnd_key():
+        return os.urandom(32)
+
+    # decompose FASTCDC_PARAMS = (algo, min_exp, max_exp, mask_bits, nc_level)
+    algo, min_exp, max_exp, mask_bits, nc_level = FASTCDC_PARAMS
+    assert algo == CH_FASTCDC
+
+    keys = [b"\0" * 32] + [rnd_key() for _ in range(10)]
+    sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)]
+
+    for key in keys:
+        chunker = ChunkerFastCDC(key, min_exp, max_exp, mask_bits, nc_level)
+        for size in sizes:
+            # Random data
+            data = os.urandom(size)
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-same data (non-zero)
+            data = b"\x42" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data
+
+            # All-zero data
+            data = b"\x00" * size
+            with BytesIO(data) as bio:
+                parts = cf_expand(chunker.chunkify(bio))
+            assert b"".join(parts) == data