From 0e3876d5e423988ad4752a4d4bfd5cb486bdc3b2 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 27 Jun 2026 20:21:05 +0200 Subject: [PATCH] buzhash64: add FastCDC-style normalized chunking Normalized chunking switches between a stricter and a looser cut mask around the target chunk size. This greatly tightens the chunk-size distribution (coefficient of variation ~0.9 -> ~0.3 in tests) and removes the dedup-hostile max-size-clamped chunks, with unchanged deduplication. chunker-params for buzhash64 gains a required 6th field, nc_level: buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level Use nc_level=2 for the new default, nc_level=0 to disable (then behavior is byte-identical to the previous single-mask chunker). buzhash (32bit) is untouched and stays bit-compatible with borg 1.x. The mask transition point (normal_size) defaults to a principled formula (target minus the expected loose-phase tail) so the mean stays near the target; it can be tuned via the normal_size constructor arg. scripts/chunker_bench.py: evidence harness used to measure chunk-size distribution, dedup ratio, throughput and shift-resilience. Measurements (before = nc_level 0, after = nc_level 2; both at the default params buzhash64,19,23,21,4095; measured with scripts/chunker_bench.py): 5 GiB of incompressible data (~2000-2700 chunks, statistically stable): before: CV 0.739, 49 max-size-clamped (8 MiB) chunks, 953 MB/s after: CV 0.311, 0 max-size-clamped chunks, 1024 MB/s Re-backup of a 2.5 GiB file after a few scattered single-byte edits (deduplication ratio; 0.5 = v2 fully deduplicated against v1, lower is better): 64 edits: before 0.5424 -> after 0.5235 320 edits: before 0.6791 -> after 0.6142 Normalized chunking deduplicates better after edits: removing the max-size-clamped chunks means a single-byte change invalidates much less data (about 36% less dedup overhead at 320 edits). Throughput was also consistently higher with nc_level=2 at this scale. Also: fix bug when computing the mask, one needs to use 1ULL instead of 1, so the shifting computation is done in a uint64, not in a 32bit int. Co-Authored-By: Claude Opus 4.8 --- docs/changes.rst | 8 + docs/usage/transfer.rst | 2 +- scripts/chunker_bench.py | 321 ++++++++++++++++++ src/borg/archiver/benchmark_cmd.py | 10 +- src/borg/archiver/completion_cmd.py | 2 +- src/borg/chunkers/__init__.py | 4 +- src/borg/chunkers/buzhash64.pyx | 59 +++- src/borg/constants.py | 6 +- src/borg/helpers/parseformat.py | 15 +- src/borg/testsuite/archiver/list_cmd_test.py | 2 +- src/borg/testsuite/chunkers/buzhash64_test.py | 6 +- 11 files changed, 414 insertions(+), 21 deletions(-) create mode 100644 scripts/chunker_bench.py diff --git a/docs/changes.rst b/docs/changes.rst index 566c3d4610..8caa0d0e8b 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -168,6 +168,14 @@ above. New features: +- buzhash64 chunker: add FastCDC-style normalized chunking and enable it by default + (``nc_level=2``). It switches between a stricter and a looser cut mask around the target + chunk size, which greatly tightens the chunk-size distribution (chunk-size variance / + coefficient of variation roughly cut by ~60% in tests) and removes the dedup-hostile + max-size-clamped chunks, at negligible throughput cost and with unchanged deduplication. + ``chunker-params`` for buzhash64 gains a required 6th field ``nc_level`` + (``buzhash64,chunk_min,chunk_max,chunk_mask,window_size,nc_level``). + buzhash (32bit) is unchanged and stays bit-compatible with borg 1.x. - repo-create: split ``--encryption`` into orthogonal options. ``--encryption`` now selects only the cipher / AE algorithm (``none``, ``authenticated``, ``aes256-ocb`` or ``chacha20-poly1305``), the new ``--id-hash`` selects the id hash function diff --git a/docs/usage/transfer.rst b/docs/usage/transfer.rst index 22e729debe..86c5716bde 100644 --- a/docs/usage/transfer.rst +++ b/docs/usage/transfer.rst @@ -55,7 +55,7 @@ locations and passphrases first: # The AEAD cipher does not matter (everything must be re-encrypted and # re-authenticated anyway); you could also choose -e chacha20-poly1305 -i blake3. $ borg repo-create -e aes256-ocb -i blake3 - $ export CHUNKER_PARAMS="buzhash64,19,23,21,4095" + $ export CHUNKER_PARAMS="buzhash64,19,23,21,4095,2" # 2. Check what and how much it would transfer: $ borg transfer --from-borg1 --chunker-params=$CHUNKER_PARAMS --dry-run diff --git a/scripts/chunker_bench.py b/scripts/chunker_bench.py new file mode 100644 index 0000000000..a90b66a04c --- /dev/null +++ b/scripts/chunker_bench.py @@ -0,0 +1,321 @@ +#!/usr/bin/env python3 +""" +buzhash64 chunker evaluation harness. + +Purpose +------- +Establish an *evidence baseline* for the current buzhash64 chunker (and buzhash32 +for reference) so that any future change to buzhash64 can be judged against real +numbers instead of intuition. + +It measures, for a given chunker config and corpus: + + * chunk-size distribution: count, mean, stddev, coefficient of variation (CV), + and how many chunks were clamped at min_size / max_size, + * deduplication ratio: unique-chunk-bytes / total-bytes (lower is better dedup), + * throughput in MB/s, + * shift resilience: re-chunk a mutated copy (bytes inserted/deleted at random + offsets) and report what fraction of chunks (by content) survive. This is the + property content-defined chunking exists for; size-distribution changes can + help or hurt it, so we must watch it. + +Corpora +------- + --path FILE_OR_DIR use real data (a dir is concatenated, file order sorted) + --synthetic random:N N bytes of os.urandom (incompressible, worst case) + --synthetic lcg:N N bytes of a cheap LCG stream (deterministic) + --synthetic textish:N N bytes of low-entropy, repetitive ascii-ish data + +Examples +-------- + python scripts/chunker_bench.py --synthetic lcg:67108864 + python scripts/chunker_bench.py --path /usr/lib --max-bytes 268435456 + python scripts/chunker_bench.py --path ./some.tar --algo buzhash64 buzhash + +This script imports the *compiled* borg chunkers, so build borg first. +It does not modify borg in any way; it is a measurement tool only. +""" + +import argparse +import hashlib +import os +import random +import statistics +import sys +import time +from io import BytesIO + +from borg.chunkers import get_chunker +from borg.constants import CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE + + +def gen_synthetic(spec): + kind, _, rest = spec.partition(":") + if kind == "versioned": + # parsed below from the full spec (it has two numeric fields) + n = 0 + else: + n = int(rest) + if kind == "random": + return os.urandom(n) + if kind == "lcg": + a = bytearray(n) + x = 1 + for i in range(n): + x = (x * 1103515245 + 12345) & 0x7FFFFFFF + a[i] = x & 0xFF + return bytes(a) + if kind == "versioned": + # "versioned:N[:E]" -> corpus = v1 ++ v2, where v2 is v1 with E scattered single-byte + # inserts/deletes (default E=64). Models backing up a slightly-changed large file: the + # dedup ratio shows how much of v2 is re-deduplicated against v1, which is exactly what + # shift-resilient chunk boundaries (and normalized chunking) affect. + parts = spec.split(":") + n = int(parts[1]) + edits = int(parts[2]) if len(parts) > 2 else 64 + v1 = os.urandom(n) + v2 = mutate(v1, edits, random.Random(42)) + corpus = v1 + v2 + del v1, v2 + return corpus + if kind == "textish": + # low-entropy, repetitive: stresses buzhash window cancellation and + # tends to produce many min/max-clamped chunks. + words = [ + b"the ", + b"quick ", + b"brown ", + b"fox ", + b"jumps ", + b"over ", + b"lazy ", + b"dog ", + b"lorem ", + b"ipsum ", + b"dolor ", + b"sit ", + ] + rng = random.Random(1234) + out = bytearray() + while len(out) < n: + out += rng.choice(words) + return bytes(out[:n]) + raise SystemExit(f"unknown synthetic spec: {spec!r}") + + +def load_path(path, max_bytes): + if os.path.isfile(path): + with open(path, "rb") as f: + return f.read(max_bytes if max_bytes else -1) + buf = bytearray() + for root, _, files in os.walk(path): + for name in sorted(files): + fp = os.path.join(root, name) + try: + with open(fp, "rb") as f: + buf += f.read() + except OSError: + continue + if max_bytes and len(buf) >= max_bytes: + return bytes(buf[:max_bytes]) + return bytes(buf) + + +def chunk_stats(algo, data, min_exp, max_exp, mask_bits, win, nc_level=0, normal_size=0): + """Chunk data and return (sizes, hashes, chunking_time) without materializing chunk bytes. + + Memory-lean: only a size (int) and a sha256 digest are kept per chunk, so very large + corpora can be processed. key=None -> zero key (deterministic).""" + params = [min_exp, max_exp, mask_bits, win] + kw = dict(key=None, sparse=False) + if algo == "buzhash64": + params.append(nc_level) # nc_level is a positional buzhash64 param + kw["normal_size"] = normal_size + chunker = get_chunker(algo, *params, **kw) + sizes = [] + hashes = [] + for c in chunker.chunkify(BytesIO(data)): + if c.data is None: # hole / all-zero alloc chunk + n = c.meta["size"] + sizes.append(n) + hashes.append(hashlib.sha256(b"\0" * n).digest()) + else: + b = c.data + sizes.append(len(b)) + hashes.append(hashlib.sha256(b).digest()) + return sizes, hashes, getattr(chunker, "chunking_time", 0.0) + + +def mutate(data, n_edits, rng): + """Insert and delete a few single bytes at random offsets (boundary shift test).""" + b = bytearray(data) + for _ in range(n_edits): + pos = rng.randrange(len(b)) + if rng.random() < 0.5: + b.insert(pos, rng.randrange(256)) + else: + del b[pos] + return bytes(b) + + +def analyze(algo, data, params, shift_edits, rng, nc_level=0, normal_size=0): + min_exp, max_exp, mask_bits, win = params + min_size, max_size = 1 << min_exp, 1 << max_exp + + t0 = time.monotonic() + sizes, hashes, internal_t = chunk_stats(algo, data, *params, nc_level=nc_level, normal_size=normal_size) + wall = time.monotonic() - t0 + + # drop last chunk for distribution stats (it is a remainder, often < min) + dist_sizes = sizes[:-1] if len(sizes) > 1 else sizes + total = sum(sizes) + + mean = statistics.fmean(dist_sizes) if dist_sizes else 0 + stdev = statistics.pstdev(dist_sizes) if len(dist_sizes) > 1 else 0.0 + cv = (stdev / mean) if mean else 0.0 + min_clamped = sum(1 for s in dist_sizes if s == min_size) + max_clamped = sum(1 for s in dist_sizes if s == max_size) + + # dedup ratio: unique chunk content / total (lower = more dedup) + seen = set() + unique_bytes = 0 + for h, n in zip(hashes, sizes): + if h not in seen: + seen.add(h) + unique_bytes += n + dedup_ratio = unique_bytes / total if total else 0.0 + + # shift resilience: re-chunk a mutated copy, fraction of chunks (by content) that survive + shift_survival = None + if shift_edits: + mutated = mutate(data, shift_edits, rng) + _, mhashes, _ = chunk_stats(algo, mutated, *params, nc_level=nc_level, normal_size=normal_size) + del mutated + orig_set = set(hashes) + survived = sum(1 for h in mhashes if h in orig_set) + shift_survival = survived / len(mhashes) if mhashes else 0.0 + + mb = total / (1024 * 1024) + secs = internal_t or wall + label = algo if not nc_level else f"{algo}/nc{nc_level}" + return { + "algo": label, + "count": len(sizes), + "total_mb": mb, + "mean": mean, + "stdev": stdev, + "cv": cv, + "min_clamped": min_clamped, + "max_clamped": max_clamped, + "min_obs": min(dist_sizes) if dist_sizes else 0, + "max_obs": max(dist_sizes) if dist_sizes else 0, + "dedup_ratio": dedup_ratio, + "throughput_mbps": mb / secs if secs else float("inf"), + "shift_survival": shift_survival, + } + + +def fmt(r): + line = ( + f"{r['algo']:>13} " + f"n={r['count']:>6} " + f"mean={r['mean']/1024:8.1f}K " + f"stdev={r['stdev']/1024:8.1f}K " + f"CV={r['cv']:5.3f} " + f"min/max-clamp={r['min_clamped']:>4}/{r['max_clamped']:<4} " + f"dedup={r['dedup_ratio']:6.4f} " + f"{r['throughput_mbps']:7.1f} MB/s" + ) + if r["shift_survival"] is not None: + line += f" shift-survive={r['shift_survival']:6.4f}" + return line + + +def main(): + ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + src = ap.add_mutually_exclusive_group(required=True) + src.add_argument("--path", help="file or directory to use as corpus") + src.add_argument("--synthetic", help="random:N | lcg:N | textish:N") + ap.add_argument("--max-bytes", type=int, default=0, help="cap corpus size (0 = no cap)") + ap.add_argument( + "--algo", + nargs="+", + default=["buzhash64", "buzhash"], + help="chunker algos to compare (default: buzhash64 buzhash)", + ) + ap.add_argument("--min-exp", type=int, default=CHUNK_MIN_EXP) + ap.add_argument("--max-exp", type=int, default=CHUNK_MAX_EXP) + ap.add_argument("--mask-bits", type=int, default=HASH_MASK_BITS) + ap.add_argument("--window", type=int, default=HASH_WINDOW_SIZE) + ap.add_argument( + "--nc-level", + type=int, + default=2, + help="normalized chunking level for buzhash64; runs nc=0 AND this level (0 to disable)", + ) + ap.add_argument( + "--normal-size", + type=int, + default=0, + help="explicit NC transition size in bytes (0 = auto = min_size + 2**mask_bits)", + ) + ap.add_argument( + "--shift-edits", type=int, default=8, help="number of random insert/delete edits for shift test (0 to skip)" + ) + ap.add_argument("--repeat", type=int, default=1, help="repeat runs (throughput stability)") + ap.add_argument("--seed", type=int, default=0) + args = ap.parse_args() + + if args.synthetic: + data = gen_synthetic(args.synthetic) + corpus_desc = args.synthetic + else: + data = load_path(args.path, args.max_bytes) + corpus_desc = args.path + if args.max_bytes: + data = data[: args.max_bytes] + + params = (args.min_exp, args.max_exp, args.mask_bits, args.window) + + print(f"corpus: {corpus_desc} size: {len(data)/(1024*1024):.1f} MiB") + print( + f"params: min_exp={params[0]} max_exp={params[1]} mask_bits={params[2]} " + f"window={params[3]} (target ~{(1< 0: + variants.append((algo, args.nc_level)) + + for algo, nc in variants: + best_tput = 0.0 + last = None + for _ in range(args.repeat): + r = analyze( + algo, + data, + params, + args.shift_edits, + random.Random(args.seed), + nc_level=nc, + normal_size=args.normal_size, + ) + best_tput = max(best_tput, r["throughput_mbps"]) + last = r + last["throughput_mbps"] = best_tput # report best (least-noisy) throughput + print(fmt(last)) + + print("-" * 118) + print( + "notes: dedup<1.0 only if corpus has duplicate content; CV lower = tighter " + "size distribution; shift-survive higher = better." + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/borg/archiver/benchmark_cmd.py b/src/borg/archiver/benchmark_cmd.py index e448bbef0a..099cac8b36 100644 --- a/src/borg/archiver/benchmark_cmd.py +++ b/src/borg/archiver/benchmark_cmd.py @@ -199,8 +199,8 @@ def chunkit(ch): ), # note: the buzhash64 chunker creation is rather slow, so we must keep it in setup ( - "buzhash64,19,23,21,4095", - "ch = get_chunker('buzhash64', 19, 23, 21, 4095, sparse=False)", + "buzhash64,19,23,21,4095,2", + "ch = get_chunker('buzhash64', 19, 23, 21, 4095, 2, sparse=False)", "chunkit(ch)", locals(), ), @@ -211,7 +211,7 @@ def chunkit(ch): algo, _, algo_params = spec.partition(",") result["chunkers"].append({"algo": algo, "algo_params": algo_params, "size": size, "time": dt}) else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") + print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s") from ..crypto.low_level import hmac_sha256, blake2b_256 import blake3 @@ -232,7 +232,7 @@ def chunkit(ch): if args.json: result["hashes"].append({"algo": spec, "size": size, "time": dt}) else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") + print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s") from ..crypto.low_level import AES256_CTR_BLAKE2b, AES256_CTR_HMAC_SHA256 from ..crypto.low_level import AES256_OCB, CHACHA20_POLY1305 @@ -272,7 +272,7 @@ def chunkit(ch): if args.json: result["encryption"].append({"algo": spec, "size": size, "time": dt}) else: - print(f"{spec:<24} {format_file_size(size):<10} {dt:.3f}s") + print(f"{spec:<26} {format_file_size(size):<10} {dt:.3f}s") if not args.json: print("Compression ====================================================") diff --git a/src/borg/archiver/completion_cmd.py b/src/borg/archiver/completion_cmd.py index 4b973fafce..1bce0cb15d 100644 --- a/src/borg/archiver/completion_cmd.py +++ b/src/borg/archiver/completion_cmd.py @@ -708,7 +708,7 @@ def do_completion(self, args): comp_spec_choices_str = " ".join(comp_spec_choices) # Chunker params choices (static list) - chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095"] + chunker_params_choices = ["default", "fixed,4194304", "buzhash,19,23,21,4095", "buzhash64,19,23,21,4095,2"] chunker_params_choices_str = " ".join(chunker_params_choices) # Relative time marker choices (static list) diff --git a/src/borg/chunkers/__init__.py b/src/borg/chunkers/__init__.py index dd3376985d..7282b5e8eb 100644 --- a/src/borg/chunkers/__init__.py +++ b/src/borg/chunkers/__init__.py @@ -17,7 +17,9 @@ def get_chunker(algo, *params, **kw): if algo == "buzhash": return Chunker(seed, *params, sparse=sparse) if algo == "buzhash64": - return ChunkerBuzHash64(bh64_key, *params, sparse=sparse) + # params is (chunk_min_exp, chunk_max_exp, hash_mask_bits, hash_window_size, nc_level); + # nc_level is passed positionally. normal_size is an optional tuning knob (0 = auto). + return ChunkerBuzHash64(bh64_key, *params, normal_size=kw.get("normal_size", 0), sparse=sparse) if algo == "fixed": return ChunkerFixed(*params, sparse=sparse) if algo == "fail": diff --git a/src/borg/chunkers/buzhash64.pyx b/src/borg/chunkers/buzhash64.pyx index a3c7bf1101..1a1ce8fc9c 100644 --- a/src/borg/chunkers/buzhash64.pyx +++ b/src/borg/chunkers/buzhash64.pyx @@ -109,6 +109,9 @@ cdef class ChunkerBuzHash64: It also uses a per-repo random seed to avoid some chunk length fingerprinting attacks. """ cdef uint64_t chunk_mask + cdef uint64_t mask_s, mask_l # normalized chunking: strict / loose masks + cdef size_t normal_size # chunk length at which we switch mask_s -> mask_l + cdef int nc_level # normalized chunking level (0 = disabled) cdef uint64_t* table cdef uint8_t* data cdef object _fd # Python object for file descriptor @@ -121,7 +124,7 @@ cdef class ChunkerBuzHash64: cdef size_t reader_block_size cdef bint sparse - def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, bint sparse=False): + def __cinit__(self, bytes key, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size, int nc_level=0, size_t normal_size=0, bint sparse=False): self.table = NULL self.data = NULL min_size = 1 << chunk_min_exp @@ -131,8 +134,29 @@ cdef class ChunkerBuzHash64: assert hash_window_size + min_size + 1 <= max_size, "too small max_size" self.window_size = hash_window_size - self.chunk_mask = (1 << hash_mask_bits) - 1 + self.chunk_mask = (1ULL << hash_mask_bits) - 1 self.min_size = min_size + # Normalized chunking (FastCDC-style): use a stricter mask (lower cut probability) until + # the chunk reaches its expected/normal size, then a looser mask (higher cut probability). + # This concentrates chunk sizes around the target and reduces chunk-size variance. + # nc_level == 0 disables it, keeping behavior byte-identical to the single-mask chunker. + assert nc_level >= 0 + assert hash_mask_bits - nc_level >= 1, "nc_level too large for hash_mask_bits" + assert hash_mask_bits + nc_level <= 48, "nc_level too large for hash_mask_bits" + self.nc_level = nc_level + if nc_level: + self.mask_s = (1ULL << (hash_mask_bits + nc_level)) - 1 + self.mask_l = (1ULL << (hash_mask_bits - nc_level)) - 1 + # normal_size is the chunk length at which we switch from the strict to the loose + # mask; it dominates the mean chunk size. The default is the nominal target size + # (1ULL << hash_mask_bits) minus the expected loose-phase tail (1ULL << (bits - nc_level)), + # which lands the mean close to the target instead of overshooting it. Pass an + # explicit normal_size to tune it further. + self.normal_size = normal_size if normal_size else ((1ULL << hash_mask_bits) - (1ULL << (hash_mask_bits - nc_level))) + else: + self.mask_s = self.chunk_mask + self.mask_l = self.chunk_mask + self.normal_size = 0 self.table = buzhash64_init_table(key) self.buf_size = max_size self.data = malloc(self.buf_size) @@ -196,10 +220,14 @@ cdef class ChunkerBuzHash64: cdef object process(self) except *: """Process the chunker's buffer and return the next chunk.""" - cdef uint64_t sum, chunk_mask = self.chunk_mask + cdef uint64_t sum, mask + cdef uint64_t mask_s = self.mask_s, mask_l = self.mask_l + cdef int nc_level = self.nc_level cdef size_t n, old_last, min_size = self.min_size, window_size = self.window_size + cdef size_t normal_size = self.normal_size, normal_pos cdef uint8_t* p cdef uint8_t* stop_at + cdef uint8_t* nc_stop cdef size_t did_bytes if self.done: @@ -232,11 +260,32 @@ cdef class ChunkerBuzHash64: self.remaining -= min_size sum = _buzhash64(self.data + self.position, window_size, self.table) - while self.remaining > window_size and (sum & chunk_mask) and not (self.eof and self.remaining <= window_size): + # Normalized chunking: pick the mask based on how far we are into the current chunk. + # While below normal_size use the strict mask (lower cut probability), afterward the + # loose mask (higher cut probability). The mask is re-evaluated at the top of every + # iteration, so the transition is honored exactly at normal_pos. When nc is disabled, + # mask_s == mask_l == chunk_mask and the normal_pos cap is not applied, so this reduces + # to the original single-mask behavior. + mask = mask_s + normal_pos = 0 + while True: + if nc_level: + normal_pos = self.last + normal_size + mask = mask_s if self.position < normal_pos else mask_l + + if not (self.remaining > window_size and (sum & mask) and not (self.eof and self.remaining <= window_size)): + break + p = self.data + self.position stop_at = p + self.remaining - window_size - while p < stop_at and (sum & chunk_mask): + if nc_level and self.position < normal_pos: + # do not scan past the strict->loose transition; re-evaluate the mask there + nc_stop = self.data + normal_pos + if nc_stop < stop_at: + stop_at = nc_stop + + while p < stop_at and (sum & mask): sum = _buzhash64_update(sum, p[0], p[window_size], window_size, self.table) p += 1 diff --git a/src/borg/constants.py b/src/borg/constants.py index 5c88b6b89e..319393ec30 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -111,9 +111,13 @@ HASH_WINDOW_SIZE = 0xFFF # 4095 B HASH_MASK_BITS = 21 # results in ~2 MiB chunks statistically +# buzhash64-only: normalized chunking level (0 disables it). buzhash (32bit) does not support this +# and must stay bit-compatible to borg 1.x, so it has no nc_level param. +NC_LEVEL = 2 # FastCDC-style normalized chunking: tightens chunk-size distribution (much lower variance) + # defaults, use --chunker-params to override CHUNKER_PARAMS = (CH_BUZHASH, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) -CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE) +CHUNKER64_PARAMS = (CH_BUZHASH64, CHUNK_MIN_EXP, CHUNK_MAX_EXP, HASH_MASK_BITS, HASH_WINDOW_SIZE, NC_LEVEL) # chunker params for the items metadata stream, finer granularity ITEMS_CHUNKER_PARAMS = (CH_BUZHASH, 15, 19, 17, HASH_WINDOW_SIZE) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index b8d9f89a9a..72183885fd 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -304,8 +304,11 @@ def ChunkerParams(s): return algo, block_size, header_size if algo == "default" and count == 1: # default return CHUNKER_PARAMS - if algo == CH_BUZHASH64 and count == 5: # buzhash64, chunk_min, chunk_max, chunk_mask, window_size - chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:]) + if algo == CH_BUZHASH64 and count == 6: + # buzhash64, chunk_min, chunk_max, chunk_mask, window_size, nc_level + # use nc_level 0 to disable normalized chunking. + chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[1:5]) + nc_level = int(params[5]) if not (chunk_min <= chunk_mask <= chunk_max): raise ArgumentTypeError("required: chunk_min <= chunk_mask <= chunk_max") if chunk_min < 6: @@ -313,8 +316,14 @@ def ChunkerParams(s): raise ArgumentTypeError("min. chunk size exponent must not be less than 6 (2^6 = 64B min. chunk size)") if chunk_max > 23: raise ArgumentTypeError("max. chunk size exponent must not be more than 23 (2^23 = 8MiB max. chunk size)") + # normalized chunking switches the mask at the target size; it needs room below and above + # the base mask bits (chunk_mask). nc_level 0 disables it. + if not (0 <= nc_level and chunk_mask - nc_level >= 1 and chunk_mask + nc_level <= 48): + raise ArgumentTypeError( + "required: 0 <= nc_level and 1 <= chunk_mask - nc_level and chunk_mask + nc_level <= 48" + ) # note that for buzhash64, there is no problem with even window_size. - return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size + return CH_BUZHASH64, chunk_min, chunk_max, chunk_mask, window_size, nc_level # this must stay last as it deals with old-style compat mode (no algorithm, 4 params, buzhash): if algo == CH_BUZHASH and count == 5 or count == 4: # [buzhash, ]chunk_min, chunk_max, chunk_mask, window_size chunk_min, chunk_max, chunk_mask, window_size = (int(p) for p in params[count - 4 :]) diff --git a/src/borg/testsuite/archiver/list_cmd_test.py b/src/borg/testsuite/archiver/list_cmd_test.py index f9bb56e58a..d5ce605890 100644 --- a/src/borg/testsuite/archiver/list_cmd_test.py +++ b/src/borg/testsuite/archiver/list_cmd_test.py @@ -250,7 +250,7 @@ def test_fingerprint(archivers, request): assert fingerprints1["input/file2"] != fingerprints4["input/file2"] # Also try with buzhash64 - cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095", "test5", "input") + cmd(archiver, "create", "--chunker-params=buzhash64,10,23,16,4095,2", "test5", "input") output = cmd(archiver, "list", "test5", "--format={fingerprint} {path}{NL}") fingerprints5 = {} for line in output.splitlines(): diff --git a/src/borg/testsuite/chunkers/buzhash64_test.py b/src/borg/testsuite/chunkers/buzhash64_test.py index 0bbeb4d3d5..9b19448587 100644 --- a/src/borg/testsuite/chunkers/buzhash64_test.py +++ b/src/borg/testsuite/chunkers/buzhash64_test.py @@ -110,15 +110,15 @@ def test_fuzz_bh64(worker): def rnd_key(): return os.urandom(32) - # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size) - algo, min_exp, max_exp, mask_bits, win_size = CHUNKER64_PARAMS + # decompose CHUNKER64_PARAMS = (algo, min_exp, max_exp, mask_bits, window_size, nc_level) + algo, min_exp, max_exp, mask_bits, win_size, nc_level = CHUNKER64_PARAMS assert algo == CH_BUZHASH64 # default chunker must be buzhash64 here keys = [b"\0" * 32] + [rnd_key() for _ in range(10)] sizes = [random.randint(1, 4 * 1024 * 1024) for _ in range(50)] for key in keys: - chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size) + chunker = ChunkerBuzHash64(key, min_exp, max_exp, mask_bits, win_size, nc_level) for size in sizes: # Random data data = os.urandom(size)