diff --git a/CHANGES.rst b/CHANGES.rst index 5c11ebd..787d554 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,18 +1,18 @@ -ChangeLog +Changelog ========= Version 0.1.0 2024-11-18 ------------------------ -- HashTableNT: deal with byte_order separately -- HashTableNT: give separate formats in value_format namedtuple +- HashTableNT: handle ``byte_order`` separately. +- HashTableNT: provide separate formats in the ``value_format`` namedtuple. Version 0.0.2 2024-11-10 ------------------------ -- Fixed "KV array is full" crash on 32bit platforms (and maybe also some other - int-size related issues), #27. -- Added a .update method to HashTableNT (like dict.update), #28. +- Fixed "KV array is full" crash on 32-bit platforms (and maybe also some other + integer-size related issues), #27. +- Added an ``.update()`` method to HashTableNT (like ``dict.update()``), #28. Version 0.0.1 2024-10-31 ------------------------ diff --git a/README.rst b/README.rst index 7ef0143..7f010b9 100644 --- a/README.rst +++ b/README.rst @@ -1,71 +1,63 @@ BorgHash -========= +======== -Memory-efficient hashtable implementations as a Python library, -implemented in Cython. +Memory-efficient hashtable implementations as a Python library implemented in Cython. HashTable --------- -``HashTable`` is a rather low-level implementation, usually one rather wants to -use the ``HashTableNT`` wrapper. But read on to get the basics... +``HashTable`` is a fairly low-level implementation; usually one will want to use the ``HashTableNT`` wrapper. Read on for the basics... Keys and Values ~~~~~~~~~~~~~~~ -The keys MUST be perfectly random ``bytes`` of arbitrary, but constant length, -like from a cryptographic hash (sha256, hmac-sha256, ...). -The implementation relies on this "perfectly random" property and does not -implement an own hash function, but just takes 32 bits from the given key. +The keys MUST be perfectly random ``bytes`` of arbitrary but fixed length, like from a cryptographic hash (SHA-256, HMAC-SHA-256, ...). +The implementation relies on this "perfectly random" property and does not implement its own hash function; it just takes 32 bits from the given key. -The values are binary ``bytes`` of arbitrary, but constant length. +The values are ``bytes`` of arbitrary but fixed length. -The length of the keys and values is defined when creating a ``HashTable`` -instance (after that, the length must always match that defined length). +The lengths of the keys and values are defined when creating a ``HashTable`` instance; thereafter, the lengths must always match the defined size. Implementation details ~~~~~~~~~~~~~~~~~~~~~~ -To have little memory overhead overall, the hashtable only stores uint32_t -indexes into separate keys and values arrays (short: kv arrays). +To have little memory overhead overall, the hashtable only stores ``uint32_t`` +indices into separate keys and values arrays (short: kv arrays). -A new key just gets appended to the keys array. The corresponding value gets -appended to the values array. After that, the key and value do not change their +A new key is appended to the keys array. The corresponding value is appended to the values array. After that, the key and value do not change their index as long as they exist in the hashtable and the ht and kv arrays are in memory. Even when kv pairs are deleted from ``HashTable``, the kv arrays never -shrink and the indexes of other kv pairs don't change. +shrink and the indices of other kv pairs don't change. 
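+
+For example, a minimal illustrative sketch using the ``k_to_idx()`` /
+``idx_to_k()`` methods (``ht`` and ``key`` are hypothetical placeholders)::
+
+    idx = ht.k_to_idx(key)  # the key's index in the keys array
+    # ... other kv pairs may be inserted or deleted here ...
+    assert ht.idx_to_k(idx) == key  # idx stays valid while the arrays are in memory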
-This is because we want to have stable array indexes for the keys/values so the -indexes can be used outside of ``HashTable`` as memory-efficient references. +This is because we want to have stable array indices for the keys/values, so the +indices can be used outside of ``HashTable`` as memory-efficient references. Memory allocated ~~~~~~~~~~~~~~~~ -For a hashtable load factor of 0.1 - 0.5, a kv array grow factor of 1.3 and +For a hashtable load factor of 0.1 – 0.5, a kv array growth factor of 1.3, and N kv pairs, memory usage in bytes is approximately: - Hashtable: from ``N * 4 / 0.5`` to ``N * 4 / 0.1`` -- Keys/Values: from ``N * len(key+value) * 1.0`` to ``N * len(key+value) * 1.3`` -- Overall: from ``N * (8 + len(key+value))`` to ``N * (40 + len(key+value) * 1.3)`` +- Keys/Values: from ``N * len(key + value) * 1.0`` to ``N * len(key + value) * 1.3`` +- Overall: from ``N * (8 + len(key + value))`` to ``N * (40 + len(key + value) * 1.3)`` -When the hashtable or the kv arrays are resized, there will be short memory -usage spikes. For the kv arrays, ``realloc()`` is used to avoid copying of -data and memory usage spikes, if possible. +When the hashtable or the kv arrays are resized, there will be brief memory-usage spikes. For the kv arrays, ``realloc()`` is used to avoid copying data and to minimize memory-usage spikes, if possible. HashTableNT ----------- ``HashTableNT`` is a convenience wrapper around ``HashTable``: -- accepts and returns ``namedtuple`` values -- implements persistence: can read (write) the hashtable from (to) a file. +- Accepts and returns ``namedtuple`` values. +- Implements persistence: can read the hashtable from a file and write it to a file. Keys and Values ~~~~~~~~~~~~~~~ Keys: ``bytes``, see ``HashTable``. -Values: any fixed type of ``namedtuple`` that can be serialized to ``bytes`` +Values: any fixed ``namedtuple`` type that can be serialized to ``bytes`` by Python's ``struct`` module using a given format string. When setting a value, it is automatically serialized. When a value is returned, @@ -75,11 +67,11 @@ Persistence ~~~~~~~~~~~ ``HashTableNT`` has ``.write()`` and ``.read()`` methods to save/load its -content to/from a file, using an efficient binary format. +contents to/from a file, using an efficient binary format. When a ``HashTableNT`` is saved to disk, only the non-deleted entries are -persisted and when it is loaded from disk, a new hashtable and new, dense -kv arrays are built - thus, kv indexes will be different! +persisted. When it is loaded from disk, a new hashtable and new, dense +kv arrays are built; thus, kv indices will be different! API --- @@ -96,15 +88,15 @@ Example code :: - # HashTableNT mapping 256bit key [bytes] --> Chunk value [namedtuple] + # HashTableNT mapping 256-bit key [bytes] --> Chunk value [namedtuple] Chunk = namedtuple("Chunk", ["refcount", "size"]) ChunkFormat = namedtuple("ChunkFormat", ["refcount", "size"]) chunk_format = ChunkFormat(refcount="I", size="I") - # 256bit (32Byte) key, 2x 32bit (4Byte) values + # 256-bit (32-byte) key, 2x 32-bit (4-byte) values ht = HashTableNT(key_size=32, value_type=Chunk, value_format=chunk_format) - key = b"x" * 32 # the key is usually from a cryptographic hash fn + key = b"x" * 32 # the key is usually from a cryptographic hash function value = Chunk(refcount=1, size=42) ht[key] = value assert ht[key] == value @@ -131,9 +123,9 @@ Want a demo? Run ``borghash-demo`` after installing the ``borghash`` package. 
-It will show you the demo code, run it and print the results for your machine. +It will show you the demo code, run it, and print the results for your machine. -Results on an Apple MacBook Pro (M3 Pro CPU) are like: +Results on an Apple MacBook Pro (M3 Pro CPU) look like: :: @@ -144,18 +136,18 @@ Results on an Apple MacBook Pro (M3 Pro CPU) are like: State of this project --------------------- -**API is still unstable and expected to change as development goes on.** +**API is still unstable and expected to change as development continues.** **As long as the API is unstable, there will be no data migration tools, -like e.g. for reading an existing serialized hashtable.** +e.g., for reading an existing serialized hashtable.** -There might be missing features or optimization potential, feedback welcome! +There might be missing features or optimization potential; feedback is welcome! Borg? ----- Please note that this code is currently **not** used by the stable release of -BorgBackup (aka "borg"), but might be used by borg master branch in the future. +BorgBackup (aka "borg"), but it might be used by Borg's master branch in the future. License ------- diff --git a/setup.py b/setup.py index 27475bb..ba679d0 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ try: from Cython.Build import cythonize except ImportError: - cythonize = None # we don't have cython installed + cythonize = None # we don't have Cython installed ext = '.pyx' if cythonize else '.c' diff --git a/src/borghash/HashTable.pyx b/src/borghash/HashTable.pyx index 633a42e..ebcae61 100644 --- a/src/borghash/HashTable.pyx +++ b/src/borghash/HashTable.pyx @@ -1,8 +1,8 @@ """ -HashTable: low-level ht mapping fully random bytes keys to bytes values. - key and value length can be chosen, but is fixed afterwards. - the keys and values are stored in arrays separate from the hashtable. - the hashtable only stores the 32bit indexes into the key/value arrays. +HashTable: low-level hash table mapping fully random bytes keys to bytes values. + Key and value lengths can be chosen, but are fixed thereafter. + The keys and values are stored in arrays separate from the hashtable. + The hashtable only stores the 32-bit indices into the key/value arrays. """ from __future__ import annotations from typing import BinaryIO, Iterator, Any @@ -49,7 +49,7 @@ cdef class HashTable: shrink_factor: float = 0.4, grow_factor: float = 2.0, kv_grow_factor: float = 1.3) -> None: # the load of the ht (.table) shall be between 0.25 and 0.5, so it is fast and has few collisions. - # it is cheap to have a low hash table load, because .table only stores uint32_t indexes into the + # it is cheap to have a low hash table load, because .table only stores uint32_t indices into the # .keys and .values array. # the keys/values arrays have bigger elements and are not hash tables, thus collisions and load # factor are no concern there. the kv_grow_factor can be relatively small. 
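+        # Illustrative arithmetic (example numbers): at the minimum load of 0.25,
+        # 1,000,000 used entries imply a capacity of 4,000,000 slots; at 4 bytes
+        # per uint32_t slot, .table itself needs ~16 MB, independent of key/value sizes.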
@@ -96,7 +96,7 @@ cdef class HashTable: free(self.values) def clear(self) -> None: - """empty HashTable, start from scratch""" + """Empty the HashTable and start from scratch.""" self.capacity = 0 self.used = 0 self._resize_table(self.initial_capacity) @@ -107,7 +107,7 @@ cdef class HashTable: return self.used cdef size_t _get_index(self, uint8_t* key): - """key must be perfectly random distributed bytes, so we don't need a hash function here.""" + """Key must be perfectly random bytes, so we don't need a hash function here.""" cdef uint32_t key32 = (key[0] << 24) | (key[1] << 16) | (key[2] << 8) | key[3] return key32 % self.capacity @@ -149,7 +149,7 @@ cdef class HashTable: self._resize_kv(int(self.kv_capacity * self.kv_grow_factor)) if self.kv_used >= self.kv_capacity: # Should never happen. See "RESERVED" constant - we allow almost 4Gi kv entries. - # For a typical 256bit key and a small 32bit value that would already consume 176GiB+ + # For a typical 256-bit key and a small 32-bit value that would already consume 176GiB+ # memory (plus spikes to even more when hashtable or kv arrays get resized). raise RuntimeError("KV array is full") @@ -260,7 +260,7 @@ cdef class HashTable: self.tombstones = 0 cdef void _resize_kv(self, size_t new_capacity): - # We must never use kv indexes >= RESERVED, thus we'll never need more capacity either. + # We must never use kv indices >= RESERVED; thus, we'll never need more capacity either. cdef size_t capacity = min(new_capacity, RESERVED - 1) self.stats_resize_kv += 1 # realloc is already highly optimized (in Linux). By using mremap internally only the peak address space usage is "old size" + "new size", while the peak memory usage is only "new size". @@ -270,8 +270,8 @@ cdef class HashTable: def k_to_idx(self, key: bytes) -> int: """ - return the key's index in the keys array (index is stable while in memory). - this can be used to "abbreviate" a known key (e.g. 256bit key -> 32bit index). + Return the key's index in the keys array (index is stable while in memory). + This can be used to "abbreviate" a known key (e.g., 256-bit key -> 32-bit index). """ if len(key) != self.ksize: raise ValueError("Key size does not match the defined size") @@ -283,16 +283,16 @@ cdef class HashTable: def idx_to_k(self, idx: int) -> bytes: """ - for a given index, return the key stored at that index in the keys array. - this is the reverse of k_to_idx (e.g. 32bit index -> 256bit key). + For a given index, return the key stored at that index in the keys array. + This is the reverse of k_to_idx (e.g., 32-bit index -> 256-bit key). """ cdef uint32_t kv_index = idx return self.keys[kv_index * self.ksize:(kv_index + 1) * self.ksize] def kv_to_idx(self, key: bytes, value: bytes) -> int: """ - return the key's/value's index in the keys/values array (index is stable while in memory). - this can be used to "abbreviate" a known key/value pair. (e.g. 256bit key + 32bit value -> 32bit index). + Return the key's/value's index in the keys/values array (index is stable while in memory). + This can be used to "abbreviate" a known key/value pair (e.g., 256-bit key + 32-bit value -> 32-bit index). """ if len(key) != self.ksize: raise ValueError("Key size does not match the defined size") @@ -309,8 +309,8 @@ cdef class HashTable: def idx_to_kv(self, idx: int) -> tuple[bytes, bytes]: """ - for a given index, return the key/value stored at that index in the keys/values array. - this is the reverse of kv_to_idx (e.g. 32bit index -> 256bit key + 32bit value). 
+ For a given index, return the key/value stored at that index in the keys/values array. + This is the reverse of kv_to_idx (e.g., 32-bit index -> 256-bit key + 32-bit value). """ cdef uint32_t kv_index = idx key = self.keys[kv_index * self.ksize:(kv_index + 1) * self.ksize] diff --git a/src/borghash/HashTableNT.pyx b/src/borghash/HashTableNT.pyx index 03311cd..b7a9aa0 100644 --- a/src/borghash/HashTableNT.pyx +++ b/src/borghash/HashTableNT.pyx @@ -36,7 +36,7 @@ cdef class HashTableNT: if not all(isinstance(fmt, str) and len(fmt) > 0 for fmt in value_format): raise ValueError("value_format's elements must be str and non-empty.") if byte_order not in BYTE_ORDER: - raise ValueError("byte_order must be one of: {','.join(BYTE_ORDER.keys())}") + raise ValueError(f"byte_order must be one of: {', '.join(BYTE_ORDER.keys())}") self.key_size = key_size self.value_type = value_type self.value_format = value_format @@ -124,7 +124,7 @@ cdef class HashTableNT: return self._to_namedtuple_value(binary_value) def update(self, other=(), /, **kwds): - """Like dict.update, but other can also be a HashTableNT instance.""" + """Like dict.update(), but 'other' can also be a HashTableNT instance.""" if isinstance(other, HashTableNT): for key, value in other.items(): self[key] = value @@ -228,9 +228,9 @@ cdef class HashTableNT: def size(self) -> int: """ - do a rough worst-case estimate of the on-disk size when using .write(). + Do a rough worst-case estimate of the on-disk size when using .write(). - the serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads. + The serialized size of the metadata is a bit hard to predict, but we cover that with one_time_overheads. """ one_time_overheads = 4096 # very rough N = self.inner.used diff --git a/src/borghash/__init__.py b/src/borghash/__init__.py index a5d2f3b..cb0e10b 100644 --- a/src/borghash/__init__.py +++ b/src/borghash/__init__.py @@ -1,5 +1,5 @@ """ -borghash - hashtable implementations in cython. +borghash - hashtable implementations in Cython. """ from .HashTable import HashTable from .HashTableNT import HashTableNT diff --git a/src/borghash/__main__.py b/src/borghash/__main__.py index 8c49b4d..7eacf27 100644 --- a/src/borghash/__main__.py +++ b/src/borghash/__main__.py @@ -1,5 +1,5 @@ """ -Demonstration of borghash. +Demonstration of BorgHash. """ def demo(): @@ -17,12 +17,12 @@ def demo(): value_type = namedtuple("Chunk", ["refcount", "size"]) value_format_t = namedtuple("ChunkFormat", ["refcount", "size"]) value_format = value_format_t(refcount="I", size="I") -# 256bit (32Byte) key, 2x 32bit (4Byte) values +# 256-bit (32-byte) key, 2x 32-bit (4-byte) values ht = HashTableNT(key_size=32, value_type=value_type, value_format=value_format) t0 = time() for i in range(count): - # make up a 256bit key from i, first 32bits need to be well distributed. + # Make up a 256-bit key from i; the first 32 bits need to be well distributed. key = f"{i:4x}{' '*28}".encode() value = value_type(refcount=i, size=i * 2) ht[key] = value @@ -50,7 +50,7 @@ def demo(): t4 = time() for i in range(count): - # make up a 256bit key from i, first 32bits need to be well distributed. + # Make up a 256-bit key from i; the first 32 bits need to be well distributed. 
key = f"{i:4x}{' '*28}".encode() expected_value = value_type(refcount=i, size=i * 2) assert ht_read.pop(key) == expected_value diff --git a/tests/benchmark_test.py b/tests/benchmark_test.py index e0da009..24e68bb 100644 --- a/tests/benchmark_test.py +++ b/tests/benchmark_test.py @@ -18,8 +18,8 @@ @pytest.fixture(scope="module") def items(): - # use quite a lot of items to reduce issues with timer resolution - # and outside influences onto the measurement. + # Use many items to reduce issues with timer resolution + # and external influences on the measurement. items = [] for x in range(1000000): key = H2(x) @@ -29,15 +29,15 @@ def items(): return frozenset(items) -def bh(): # borghash +def bh(): # BorgHash return HashTable(key_size=KEY_SIZE, value_size=VALUE_SIZE) -def bhnt(): # borghash +def bhnt(): # BorgHash return HashTableNT(key_size=KEY_SIZE, value_type=VALUE_TYPE, value_format=VALUE_FMT) -def pd(): # python dict +def pd(): # Python dict return dict() @@ -65,7 +65,7 @@ def func(ht, items, nt): def test_update(benchmark, ht_class, nt, items): def func(ht, items, nt): for key, value_raw, value_nt in items: - ht[key] = value_nt if nt else value_raw # update value for an existing ht entry + ht[key] = value_nt if nt else value_raw # Update the value for an existing hashtable entry benchmark.pedantic(func, setup=lambda: setup(ht_class, items, fill=True, nt=nt)) diff --git a/tests/hashtable_stress_test.py b/tests/hashtable_stress_test.py index 1b6713c..609d13e 100644 --- a/tests/hashtable_stress_test.py +++ b/tests/hashtable_stress_test.py @@ -8,20 +8,20 @@ def H(x, y): """ - Create a 256bit key - x will determine the first 32 bits, y will determine the last 32bit. - As our HashTable computes the ht index from first 32 bits, same x will give same ht index (a collision). + Create a 256-bit key; x determines the first 32 bits, y determines the last 32 bits. + As our HashTable computes the hash table (ht) index from the first 32 bits, the same x will give the same index (a collision). """ - return struct.pack(">IIIIIIII", x, 0, 0, 0, 0, 0, 0, y) # BE is easier to read. + return struct.pack(">IIIIIIII", x, 0, 0, 0, 0, 0, 0, y) # Big-endian (BE) is easier to read. 
@pytest.fixture def ht(): - # 256bit keys, 32bit values + # 256-bit keys, 32-bit values return HashTable(key_size=32, value_size=4) def check(ht, pydict, destructive=False): - """check if ht has same contents as pydict""" + """Check whether ht has the same contents as pydict.""" assert len(ht) == len(pydict) assert dict(ht.items()) == pydict for key, value in pydict.items(): @@ -37,7 +37,7 @@ def check(ht, pydict, destructive=False): def test_few_collisions_stress(ht): pydict = {} for h in range(10000): - key = H(h, h) # few collisions + key = H(h, h) # Few collisions value = key[-4:] ht[key] = value pydict[key] = value @@ -47,7 +47,7 @@ def test_few_collisions_stress(ht): def test_many_collisions_stress(ht): pydict = {} for h in range(10000): - key = H(0, h) # everything collides + key = H(0, h) # Everything collides value = key[-4:] ht[key] = value pydict[key] = value @@ -64,7 +64,7 @@ def new_random_keys(count): keys = set() while len(keys) < count: x = random.randint(0, UINT32_MAX) - key = H(x, x) # few collisions + key = H(x, x) # Few collisions keys.add(key) return keys diff --git a/tests/hashtable_test.py b/tests/hashtable_test.py index 57bcd83..4622a43 100644 --- a/tests/hashtable_test.py +++ b/tests/hashtable_test.py @@ -4,25 +4,25 @@ from borghash import HashTable -# 256bit keys, 32bit values +# 256-bit keys, 32-bit values key1, value1 = b"a" * 32, b"A" * 4 key2, value2 = b"b" * 32, b"B" * 4 key3, value3 = b"c" * 32, b"C" * 4 def H(x): - # make some 32byte long thing that depends on x + # Make a 32-byte-long value that depends on x return bytes("%-0.32d" % x, "ascii") def H2(x): - # like H(x), but with pseudo-random distribution of the output value + # Like H(x), but with a pseudo-random distribution of the output value. return hashlib.sha256(H(x)).digest() @pytest.fixture def ht(): - # 8 entries initially, 256bit keys, 4Byte (32bit) values + # 8 entries initially, 256-bit keys, 4-byte (32-bit) values return HashTable(key_size=32, value_size=4) diff --git a/tests/hashtablent_test.py b/tests/hashtablent_test.py index 306f7c1..cad2498 100644 --- a/tests/hashtablent_test.py +++ b/tests/hashtablent_test.py @@ -7,10 +7,10 @@ from .hashtable_test import H2 -key_size = 32 # 32 bytes = 256bits key +key_size = 32 # 32 bytes = 256-bit key value_type = namedtuple("vt", "v1 v2 v3") value_format_t = namedtuple("vf", "v1 v2 v3") -value_format = value_format_t(v1="I", v2="I", v3="I") # 3x little endian 32bit unsigned int +value_format = value_format_t(v1="I", v2="I", v3="I") # 3x little-endian 32-bit unsigned integers key1, value1 = b"a" * 32, value_type(11, 12, 13) key2, value2 = b"b" * 32, value_type(21, 22, 23) @@ -108,7 +108,7 @@ def test_update_ntht(ntht12, ntht): def test_ntht_stress(ntht): - # this also triggers some hashtable resizing + # This also triggers some hashtable resizing. keys = set() for i in range(10000): key = H2(i) @@ -170,20 +170,20 @@ def test_read_write(ntht12, tmp_path): @pytest.mark.parametrize("n", [1000, 10000, 100000, 1000000]) def test_size(ntht, n): - # fill the ht + # Fill the hashtable. for i in range(n): key = H2(i) v = key[0] - # use mid-size integers as values (not too small, not too big) + # Use mid-size integers as values (not too small, not too big). value = value_type(v * 123456, v * 234567, v * 345678) ntht[key] = value - # estimate size + # Estimate size. estimated_size = ntht.size() - # serialize and determine real size + # Serialize and determine the actual size. 
with BytesIO() as f: ntht.write(f) real_size = f.tell() - # is our estimation good enough? + # Is our estimate good enough? assert estimated_size * 0.9 < real_size < estimated_size * 1.0
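+    # size() is documented as a rough worst-case estimate, so the real size should
+    # never exceed it; the 0.9 factor checks that it is not overly pessimistic.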