diff --git a/src/lean_spec/subspecs/node/anchor.py b/src/lean_spec/subspecs/node/anchor.py index 80618d9a..0e27aa9c 100644 --- a/src/lean_spec/subspecs/node/anchor.py +++ b/src/lean_spec/subspecs/node/anchor.py @@ -14,9 +14,14 @@ from typing import cast -from lean_spec.forks import ForkProtocol, Store, Validators -from lean_spec.forks.lstar.containers import Block, BlockBody -from lean_spec.forks.lstar.containers.block.types import AggregatedAttestations +from lean_spec.forks import ( + AggregatedAttestations, + Block, + BlockBody, + ForkProtocol, + Store, + Validators, +) from lean_spec.subspecs.genesis import GenesisConfig from lean_spec.subspecs.networking.reqresp.message import Status from lean_spec.subspecs.ssz.hash import hash_tree_root diff --git a/src/lean_spec/types/rlp.py b/src/lean_spec/types/rlp.py index 2ee8b9f7..bc79e20f 100644 --- a/src/lean_spec/types/rlp.py +++ b/src/lean_spec/types/rlp.py @@ -1,176 +1,181 @@ """ -Recursive Length Prefix (RLP) Encoding -====================================== +Recursive Length Prefix (RLP) encoding for Ethereum. -RLP is Ethereum's serialization format for arbitrary nested binary data. -It is used for encoding transactions, blocks, ENR records, and more. +RLP serializes byte strings and arbitrarily nested lists of byte strings. -Encoding Rules --------------- +It is the wire format for transactions, blocks, ENR records, and devp2p messages. -RLP encodes two types of items: +Encoding rules (Yellow Paper Appendix B): -1. **Byte strings** (including empty string) -2. **Lists** of items (including empty list) + prefix range meaning + 0x00 .. 0x7f single byte payload, encoded as itself + 0x80 .. 0xb7 string of 0 to 55 bytes, length = prefix - 0x80 + 0xb8 .. 0xbf long string, prefix - 0xb7 = number of length bytes that follow + 0xc0 .. 0xf7 list with 0 to 55 payload bytes, length = prefix - 0xc0 + 0xf8 .. 0xff long list, prefix - 0xf7 = number of length bytes that follow -Byte ranges determine the encoding: - -+-------------+-----------------------------------------------------------+ -| Prefix | Meaning | -+=============+===========================================================+ -| [0x00-0x7f] | Single byte, value is the byte itself | -+-------------+-----------------------------------------------------------+ -| [0x80-0xb7] | Short string (0-55 bytes), length = prefix - 0x80 | -+-------------+-----------------------------------------------------------+ -| [0xb8-0xbf] | Long string (>55 bytes), prefix - 0xb7 = length of length | -+-------------+-----------------------------------------------------------+ -| [0xc0-0xf7] | Short list (0-55 bytes payload), length = prefix - 0xc0 | -+-------------+-----------------------------------------------------------+ -| [0xf8-0xff] | Long list (>55 bytes payload), prefix - 0xf7 = len of len | -+-------------+-----------------------------------------------------------+ +Strings and lists share the same length-prefix structure. +The only difference is the base byte: 0x80 for strings, 0xc0 for lists. +The 0x40 gap is the entire string-vs-list distinction. References: ----------- -- Ethereum Yellow Paper, Appendix B +- Ethereum Yellow Paper, Appendix B. - https://ethereum.org/en/developers/docs/data-structures-and-encoding/rlp/ -- https://github.com/ethereum/pyrlp """ -from __future__ import annotations - +import math from typing import Final type RLPItem = bytes | list[RLPItem] -""" -RLP-encodable item. +"""A byte string or a list of items recursively.""" -Either: -- bytes (a byte string) -- list of RLP items (recursive) -""" +STRING_BASE: Final = 0x80 +"""The byte at which byte-string prefixes begin. -SINGLE_BYTE_MAX: Final = 0x7F -"""Boundary between single-byte encoding [0x00-0x7f] and string prefix.""" +Anything below (0x00 through 0x7F) is its own encoding, with no prefix needed. +From this byte upward, a prefix carries either the string's length or a length-of-length. +The boundary keeps small single bytes free of encoding overhead. +""" -SHORT_STRING_PREFIX: Final = 0x80 -"""Prefix for short strings (0-55 bytes). Final prefix = 0x80 + length.""" +LIST_BASE: Final = 0xC0 +"""The byte at which list prefixes begin. -SHORT_STRING_MAX_LEN: Final = 55 -"""Maximum string length for short encoding.""" +Lists and byte strings share the same length-prefix structure, just shifted by 0x40. +That offset is what tells the decoder to open a list instead of a string. +0xC0 sits above all 64 string prefixes (0x80 to 0xBF), leaving them undisturbed. +""" -LONG_STRING_BASE: Final = 0xB7 -"""Base for long string prefix. Final prefix = 0xb7 + length_of_length.""" +SHORT_FORM_MAX: Final = 55 +"""The largest payload length that fits the compact one-byte prefix. -SHORT_LIST_PREFIX: Final = 0xC0 -"""Prefix for short lists (0-55 bytes payload). Final prefix = 0xc0 + length.""" +- Smaller payloads encode with just base + length as the prefix, no extra bytes. +- Larger payloads switch to the long form, where the prefix carries a length-of-length. -SHORT_LIST_MAX_LEN: Final = 55 -"""Maximum list payload length for short encoding.""" +Why 55: each base owns 64 prefix values. +Lengths 0 through 55 take 56 of them, leaving 8 for long-form variants. +""" -LONG_LIST_BASE: Final = 0xF7 -"""Base for long list prefix. Final prefix = 0xf7 + length_of_length.""" + +class RLPDecodingError(Exception): + """Raised when RLP input is malformed or non-canonical.""" def encode_rlp(item: RLPItem) -> bytes: """ - Encode an item using RLP. + Encode bytes or a nested list of bytes to RLP. Args: - item: Bytes or nested list of bytes to encode. + item: A byte string or a list of items (lists may nest arbitrarily). Returns: - RLP-encoded bytes. + The RLP encoding of the input. Raises: - TypeError: If item is not bytes or list. + TypeError: If any element is neither bytes nor list. """ + # Bytes branch: either bare-byte fast path or length-prefixed string. + # + # A single byte below 0x80 is its own complete encoding. + # The spec writes this boundary as "< 0x80" so the literal matches the prose. + # + # Examples: + # + # b"\x7f" -> 7f (bare byte, no prefix) + # b"\x80" -> 81 80 (length-prefixed: single byte at or above 0x80) + # b"" -> 80 (empty string is the length-zero short form) + # b"dog" -> 83 64 6f 67 (short string of length 3) if isinstance(item, bytes): - return _encode_bytes(item) + if len(item) == 1 and item[0] < STRING_BASE: + return item + return _with_length_prefix(item, base=STRING_BASE) + + # List branch: encode each element, concatenate, then wrap with a list prefix. + # + # Recursion handles nesting at any depth. + # The list base 0xC0 is what distinguishes lists from strings on the wire. + # + # Examples: + # + # [] -> c0 (empty list) + # [b"a", b"b"] -> c2 61 62 (short list, payload "a" "b") + # [b"dog", b"god"] -> c8 83 64 6f 67 83 67 6f 64 (two short strings inside) if isinstance(item, list): - return _encode_list(item) - raise TypeError(f"Cannot RLP encode type: {type(item).__name__}") + payload = b"".join(encode_rlp(element) for element in item) + return _with_length_prefix(payload, base=LIST_BASE) + raise TypeError(f"Cannot RLP encode type: {type(item).__name__}") -def _encode_bytes(data: bytes) -> bytes: - """ - Encode a byte string. - Single bytes in [0x00, 0x7f] encode as themselves. - Short strings (0-55 bytes) use prefix 0x80 + length. - Long strings (>55 bytes) use prefix 0xb7 + length-of-length, then length. +def _with_length_prefix(payload: bytes, base: int) -> bytes: """ - length = len(data) - - # Single byte encoding: values 0x00-0x7f encode as themselves. - if length == 1 and data[0] <= SINGLE_BYTE_MAX: - return data - - # Short string: 0-55 bytes. - if length <= SHORT_STRING_MAX_LEN: - return bytes([SHORT_STRING_PREFIX + length]) + data - - # Long string: >55 bytes. - length_bytes = _encode_length(length) - return bytes([LONG_STRING_BASE + len(length_bytes)]) + length_bytes + data + Wrap a payload with an RLP length prefix. + Args: + payload: Already-encoded body to wrap. + base: 0x80 for strings, 0xC0 for lists. -def _encode_list(items: list[RLPItem]) -> bytes: - """ - Encode a list of items. - - Recursively encodes each item, concatenates, then adds list prefix. - Short lists (0-55 bytes payload) use prefix 0xc0 + length. - Long lists (>55 bytes payload) use prefix 0xf7 + length-of-length, then length. + Returns: + Prefix byte, optional length bytes, then the payload. """ - # Recursively encode all items. - payload = b"".join(encode_rlp(item) for item in items) length = len(payload) - # Short list: 0-55 bytes payload. - if length <= SHORT_LIST_MAX_LEN: - return bytes([SHORT_LIST_PREFIX + length]) + payload - - # Long list: >55 bytes payload. - length_bytes = _encode_length(length) - return bytes([LONG_LIST_BASE + len(length_bytes)]) + length_bytes + payload - - -def _encode_length(value: int) -> bytes: - """ - Encode length as minimal big-endian bytes. - - Used for long string/list length encoding where length > 55. - Returns minimal representation with no leading zeros. - """ - if value == 0: - # Defensive: should never be called with 0 for valid long encodings. - return b"" - return value.to_bytes((value.bit_length() + 7) // 8, "big") - - -class RLPDecodingError(Exception): - """Error during RLP decoding.""" + # Short form (payload of 0 to 55 bytes). + # + # The prefix byte alone carries the length. + # No separate length field is needed. + # + # Example: base = 0x80, payload = b"dog" (length 3) + # + # prefix = 0x80 + 3 = 0x83 + # output = 83 64 6f 67 (prefix, then "dog") + if length <= SHORT_FORM_MAX: + return bytes([base + length]) + payload + + # Long form (payload of 56 bytes or more). + # + # The length itself is written as a minimal big-endian field. + # The prefix byte encodes how many bytes that field occupies. + # + # For a payload of N bytes: + # + # - N.bit_length() 1-indexed position of the highest set bit of N. + # - math.ceil(bit_length / 8) minimal byte count needed to hold N. + # - base + 55 + that byte count prefix byte for this long encoding. + # + # The +55 lifts the prefix above the short range. + # The byte count is what differentiates each long prefix value. + # + # Example: base = 0x80, payload of length 1024 + # + # bit_length(1024) = 11 + # math.ceil(11 / 8) = 2 + # length_bytes = 04 00 (1024 big-endian, two bytes) + # prefix = 0x80 + 55 + 2 = 0xB9 + # output = B9 04 00 [1024 bytes of payload] + length_bytes = length.to_bytes(math.ceil(length.bit_length() / 8), "big") + return bytes([base + SHORT_FORM_MAX + len(length_bytes)]) + length_bytes + payload def decode_rlp(data: bytes) -> RLPItem: """ - Decode RLP-encoded bytes. + Decode a single RLP item from the full input. Args: - data: RLP-encoded bytes. + data: RLP-encoded bytes containing exactly one top-level item. Returns: - Decoded item (bytes or nested list). + The decoded byte string or list. Raises: - RLPDecodingError: If data is malformed. + RLPDecodingError: If the input is empty, truncated, has trailing bytes, or is non-canonical. """ if len(data) == 0: raise RLPDecodingError("Empty RLP data") item, consumed = _decode_item(data, 0) + # Reject trailing data so each input maps to exactly one item. if consumed != len(data): raise RLPDecodingError(f"Trailing data: decoded {consumed} of {len(data)} bytes") @@ -179,123 +184,200 @@ def decode_rlp(data: bytes) -> RLPItem: def decode_rlp_list(data: bytes) -> list[bytes]: """ - Decode RLP data as a flat list of byte items. + Decode an RLP list of byte strings, rejecting nested lists. - This is a convenience function for cases like ENR where - we expect a flat list of byte strings (no nested lists). + Used by callers that expect a flat record such as ENR. Args: - data: RLP-encoded bytes. + data: RLP-encoded bytes that must decode to a list of byte strings. Returns: - List of decoded byte strings. + The decoded byte strings in order. Raises: - RLPDecodingError: If data is not a list or contains nested lists. + RLPDecodingError: If the input is not a flat list of byte strings. """ item = decode_rlp(data) + # Top-level must be a list, not a bare byte string. if not isinstance(item, list): raise RLPDecodingError("Expected RLP list") + # Validate every element while building the narrowed result. + # + # Each iteration proves the element is bytes before appending. + # The new list carries the precise element type without needing a cast. result: list[bytes] = [] - for i, elem in enumerate(item): - if not isinstance(elem, bytes): - raise RLPDecodingError(f"Element {i} is not bytes") - result.append(elem) - + for index, element in enumerate(item): + if not isinstance(element, bytes): + raise RLPDecodingError(f"Element {index} is not bytes") + result.append(element) return result def _decode_item(data: bytes, offset: int) -> tuple[RLPItem, int]: """ - Decode a single RLP item starting at offset. + Decode one item starting at offset and report how many bytes it consumed. - Returns (decoded_item, bytes_consumed). - """ - if offset >= len(data): - raise RLPDecodingError("Unexpected end of data") + The prefix byte selects bare-byte, string, or list dispatch. + String and list paths share length parsing because the only difference is the base. - prefix = data[offset] - - # Single byte: 0x00-0x7f. - if prefix <= SINGLE_BYTE_MAX: - return data[offset : offset + 1], offset + 1 - - # Short string: 0x80-0xb7. - if prefix <= LONG_STRING_BASE: - length = prefix - SHORT_STRING_PREFIX - start = offset + 1 - end = start + length - _check_bounds(data, end) - return data[start:end], end - - # Long string: 0xb8-0xbf. - if prefix < SHORT_LIST_PREFIX: - len_of_len = prefix - LONG_STRING_BASE - start = offset + 1 - _check_bounds(data, start + len_of_len) + Args: + data: Full input buffer. + offset: Position of this item's prefix byte. - # Validate: no leading zeros in length encoding. - if len_of_len > 1 and data[start] == 0: - raise RLPDecodingError("Non-canonical: leading zeros in length encoding") + Returns: + A pair of decoded item and absolute offset of the next byte. - length = int.from_bytes(data[start : start + len_of_len], "big") + Raises: + RLPDecodingError: On truncation, non-canonical length, or non-minimal length bytes. + """ + # The caller guarantees the offset is within bounds. + # + # Top-level entry rejects empty input before any recursion. + # Recursive entry only fires while the cursor lies inside the parent list payload. + # The length helper bounds every payload by the buffer length. + prefix = data[offset] - # Validate: length must require this many bytes. - if length <= SHORT_STRING_MAX_LEN: - raise RLPDecodingError("Non-canonical: long string encoding for short string") + # Bare byte: prefix below 0x80 is the entire payload. + # + # Example: data = 7f, offset = 0 + # + # prefix = 0x7f (below 0x80, no length field) + # item = b"\x7f" + # next offset = 1 + if prefix < STRING_BASE: + return data[offset : offset + 1], offset + 1 - payload_start = start + len_of_len - payload_end = payload_start + length - _check_bounds(data, payload_end) + # String or list dispatch by prefix family. + # + # The string range ends at 0xC0 (exclusive) and the list range starts there. + # The shared length helper resolves payload bounds for either family. + base = STRING_BASE if prefix < LIST_BASE else LIST_BASE + payload_start, payload_end = _decode_length(data, offset, base) + + # String branch: payload bytes are the decoded value. + # + # Example: data = 83 64 6f 67, offset = 0 + # + # prefix = 0x83 (0x80 + 3, short string of length 3) + # payload range = [1, 4) + # item = b"dog" + # next offset = 4 + if base == STRING_BASE: return data[payload_start:payload_end], payload_end - # Short list: 0xc0-0xf7. - if prefix <= LONG_LIST_BASE: - length = prefix - SHORT_LIST_PREFIX - start = offset + 1 - end = start + length - _check_bounds(data, end) - return _decode_list_payload(data, start, end), end - - # Long list: 0xf8-0xff. - len_of_len = prefix - LONG_LIST_BASE - start = offset + 1 - _check_bounds(data, start + len_of_len) - - # Validate: no leading zeros in length encoding. - if len_of_len > 1 and data[start] == 0: - raise RLPDecodingError("Non-canonical: leading zeros in length encoding") + # List branch: drain items from the bounded payload range. + # + # The cursor advances by each child item's consumed bytes. + # It must land exactly on the payload end after the final item. + # A mismatch means an inner item declared a length that overflowed the list boundary. + # + # Example: data = c8 83 64 6f 67 83 67 6f 64, offset = 0 + # + # prefix = 0xc8 (0xc0 + 8, short list of 8 payload bytes) + # payload range = [1, 9) + # cursor walk = 1 -> 5 -> 9 (two short strings consumed in turn) + # items = [b"dog", b"god"] + items: list[RLPItem] = [] + cursor = payload_start + while cursor < payload_end: + inner, cursor = _decode_item(data, cursor) + items.append(inner) + if cursor != payload_end: + raise RLPDecodingError("List payload length mismatch") + return items, payload_end - length = int.from_bytes(data[start : start + len_of_len], "big") - # Validate: length must require this many bytes. - if length <= SHORT_LIST_MAX_LEN: - raise RLPDecodingError("Non-canonical: long list encoding for short list") +def _decode_length(data: bytes, offset: int, base: int) -> tuple[int, int]: + """ + Resolve the payload range for a string or list prefix. - payload_start = start + len_of_len - payload_end = payload_start + length - _check_bounds(data, payload_end) - return _decode_list_payload(data, payload_start, payload_end), payload_end + # Why canonicalization checks + RLP must have one canonical encoding per value to be hash-deterministic. + Two rules enforce this: -def _decode_list_payload(data: bytes, start: int, end: int) -> list[RLPItem]: - """Decode list payload between start and end offsets.""" - items: list[RLPItem] = [] - offset = start + - Length bytes themselves must be minimal, so a leading zero in a multi-byte length is invalid. + - A payload short enough for the short form must not appear in the long form. - while offset < end: - item, offset = _decode_item(data, offset) - items.append(item) + Both rules are consensus-critical for ENR. - if offset != end: - raise RLPDecodingError("List payload length mismatch") + Args: + data: Full input buffer. + offset: Position of the prefix byte. + base: 0x80 for strings, 0xC0 for lists. - return items + Returns: + Start and end offsets of the payload within the buffer. + Raises: + RLPDecodingError: On truncation or non-canonical length encoding. + """ + prefix = data[offset] + short_length = prefix - base + + # Phase 1: short form (prefix in base..base+55). + # + # The low bits of the prefix carry the payload length directly. + # The payload starts right after the prefix byte. + # + # Example: data = 83 64 6f 67, offset = 0, base = 0x80 + # + # prefix = 0x83 + # short_length = 0x83 - 0x80 = 3 + # payload range = [1, 4) (bytes 64 6f 67 = "dog") + if short_length <= SHORT_FORM_MAX: + start = offset + 1 + end = start + short_length + if end > len(data): + raise RLPDecodingError(f"Data too short: need {end}, have {len(data)}") + return start, end + + # Phase 2: long form (prefix in base+56..base+63). + # + # The low bits of the prefix carry the length-of-length. + # The next length-of-length bytes form a big-endian length field. + # The payload starts right after the length field. + # + # Example: data = b9 04 00 [1024 payload bytes], offset = 0, base = 0x80 + # + # prefix = 0xb9 + # short_length = 0xb9 - 0x80 = 57 + # len_of_len = 57 - 55 = 2 + # length = int(04 00, big) = 1024 + # payload range = [3, 1027) + len_of_len = short_length - SHORT_FORM_MAX + length_start = offset + 1 + length_end = length_start + len_of_len + if length_end > len(data): + raise RLPDecodingError(f"Data too short: need {length_end}, have {len(data)}") + + # Canonicalization check: leading-zero length bytes are forbidden. + # + # A leading zero would give the same length value with a shorter encoding. + # Allowing both forms would produce two valid encodings for the same item. + # + # Example: input b9 00 38 [56 bytes] is rejected. + # The shorter equivalent b8 38 [56 bytes] is the canonical form. + if len_of_len > 1 and data[length_start] == 0: + raise RLPDecodingError("Non-canonical: leading zeros in length encoding") -def _check_bounds(data: bytes, end: int) -> None: - """Verify end offset is within data bounds.""" + length = int.from_bytes(data[length_start:length_end], "big") + + # Canonicalization check: payloads that fit the short form must use it. + # + # Any length up to 55 has a short-form prefix between base and base+55. + # Wrapping such a payload in long form would be a second valid encoding. + # + # Example: input b8 37 [55 "a" bytes] is rejected. + # The shorter equivalent b7 [55 "a" bytes] is the canonical form. + if length <= SHORT_FORM_MAX: + kind = "string" if base == STRING_BASE else "list" + raise RLPDecodingError(f"Non-canonical: long {kind} encoding for short {kind}") + + start = length_end + end = start + length if end > len(data): raise RLPDecodingError(f"Data too short: need {end}, have {len(data)}") + return start, end diff --git a/tests/lean_spec/types/test_rlp.py b/tests/lean_spec/types/test_rlp.py index 0d835811..eb421b7a 100644 --- a/tests/lean_spec/types/test_rlp.py +++ b/tests/lean_spec/types/test_rlp.py @@ -5,13 +5,6 @@ import pytest from lean_spec.types.rlp import ( - LONG_LIST_BASE, - LONG_STRING_BASE, - SHORT_LIST_MAX_LEN, - SHORT_LIST_PREFIX, - SHORT_STRING_MAX_LEN, - SHORT_STRING_PREFIX, - SINGLE_BYTE_MAX, RLPDecodingError, RLPItem, decode_rlp, @@ -19,10 +12,50 @@ encode_rlp, ) -# Derived constants for test assertions. -# Long encoding prefixes are BASE + 1 (for 1-byte length). -LONG_STRING_PREFIX = LONG_STRING_BASE + 1 # 0xB8 -LONG_LIST_PREFIX = LONG_LIST_BASE + 1 # 0xF8 +# RLP spec boundaries (Yellow Paper Appendix B). +# Inlined here to match the wire-format literals used throughout the spec. + +SINGLE_BYTE_MAX = 0x7F +"""Largest value that encodes as itself with no prefix. + +Bytes above this need a length prefix to stay distinguishable from prefix bytes. +""" + +SHORT_STRING_PREFIX = 0x80 +"""Base byte for short string prefixes. + +A short string's prefix is this base plus its length, so the range runs 0x80 through 0xB7. +""" + +SHORT_STRING_MAX_LEN = 55 +"""Largest payload length that fits the short string range. + +Anything larger switches to the long form, where the length itself follows the prefix. +""" + +LONG_STRING_PREFIX = 0xB8 +"""First byte of the long string range. + +Sits one above the last short-string prefix, marking the boundary between short and long forms. +""" + +SHORT_LIST_PREFIX = 0xC0 +"""Base byte for short list prefixes. + +A short list's prefix is this base plus its payload length, so the range runs 0xC0 through 0xF7. +""" + +SHORT_LIST_MAX_LEN = 55 +"""Largest payload length that fits the short list range. + +Anything larger switches to the long form, where the length itself follows the prefix. +""" + +LONG_LIST_PREFIX = 0xF8 +"""First byte of the long list range. + +Sits one above the last short-list prefix, marking the boundary between short and long forms. +""" class TestEncodeEmptyString: @@ -263,22 +296,22 @@ class TestEncodeTypeErrors: def test_encode_invalid_type_int(self) -> None: """Encoding an integer directly raises TypeError.""" - with pytest.raises(TypeError, match=r"Cannot RLP encode type: int"): + with pytest.raises(TypeError, match=r"^Cannot RLP encode type: int$"): encode_rlp(42) # type: ignore[arg-type] def test_encode_invalid_type_str(self) -> None: """Encoding a string directly raises TypeError.""" - with pytest.raises(TypeError, match=r"Cannot RLP encode type: str"): + with pytest.raises(TypeError, match=r"^Cannot RLP encode type: str$"): encode_rlp("hello") # type: ignore[arg-type] def test_encode_invalid_type_none(self) -> None: """Encoding None raises TypeError.""" - with pytest.raises(TypeError, match=r"Cannot RLP encode type: NoneType"): + with pytest.raises(TypeError, match=r"^Cannot RLP encode type: NoneType$"): encode_rlp(None) # type: ignore[arg-type] def test_encode_invalid_nested_type(self) -> None: """Encoding a list with invalid nested type raises TypeError.""" - with pytest.raises(TypeError, match=r"Cannot RLP encode type: int"): + with pytest.raises(TypeError, match=r"^Cannot RLP encode type: int$"): encode_rlp([b"valid", 123]) # type: ignore[list-item] @@ -412,59 +445,87 @@ class TestDecodeErrors: def test_decode_empty_data(self) -> None: """Decoding empty data raises RLPDecodingError.""" - with pytest.raises(RLPDecodingError, match=r"Empty RLP data"): + with pytest.raises(RLPDecodingError, match=r"^Empty RLP data$"): decode_rlp(b"") def test_decode_trailing_data(self) -> None: """Extra bytes after valid RLP raise RLPDecodingError.""" # Valid empty string (0x80) followed by extra byte - with pytest.raises(RLPDecodingError, match=r"Trailing data"): + with pytest.raises(RLPDecodingError, match=r"^Trailing data: decoded 1 of 2 bytes$"): decode_rlp(bytes.fromhex("8000")) def test_decode_short_string_truncated(self) -> None: """Truncated short string raises RLPDecodingError.""" # 0x83 indicates 3-byte string, but only 2 bytes provided - with pytest.raises(RLPDecodingError, match=r"Data too short"): + with pytest.raises(RLPDecodingError, match=r"^Data too short: need 4, have 3$"): decode_rlp(bytes.fromhex("836465")) # "de" instead of "dog" def test_decode_long_string_truncated_length(self) -> None: """Truncated length field in long string raises RLPDecodingError.""" # 0xb9 indicates 2-byte length, but only 1 byte provided - with pytest.raises(RLPDecodingError, match=r"Data too short"): + with pytest.raises(RLPDecodingError, match=r"^Data too short: need 3, have 2$"): decode_rlp(bytes.fromhex("b904")) def test_decode_long_string_truncated_payload(self) -> None: """Truncated payload in long string raises RLPDecodingError.""" # 0xb838 indicates 56 bytes, but insufficient data provided - with pytest.raises(RLPDecodingError, match=r"Data too short"): + with pytest.raises(RLPDecodingError, match=r"^Data too short: need 58, have 4$"): decode_rlp(bytes.fromhex("b8380000")) # Only 2 bytes of payload def test_decode_short_list_truncated(self) -> None: """Truncated short list raises RLPDecodingError.""" # 0xc3 indicates 3-byte payload, but only 2 bytes provided - with pytest.raises(RLPDecodingError, match=r"Data too short"): + with pytest.raises(RLPDecodingError, match=r"^Data too short: need 4, have 3$"): decode_rlp(bytes.fromhex("c38080")) def test_decode_long_list_truncated_length(self) -> None: """Truncated length field in long list raises RLPDecodingError.""" # 0xf9 indicates 2-byte length, but only 1 byte provided - with pytest.raises(RLPDecodingError, match=r"Data too short"): + with pytest.raises(RLPDecodingError, match=r"^Data too short: need 3, have 2$"): decode_rlp(bytes.fromhex("f904")) def test_decode_non_canonical_long_string_for_short(self) -> None: """Using long string encoding for short string is non-canonical.""" # 0xb801 indicates long string with 1-byte length containing 0x38 (56) # but 0x38 <= 55, so this should be encoded as short string - with pytest.raises(RLPDecodingError, match=r"Non-canonical.*long string"): + expected = r"^Non-canonical: long string encoding for short string$" + with pytest.raises(RLPDecodingError, match=expected): # 0xb8 followed by length 0x37 (55) - should have used short encoding decode_rlp(bytes.fromhex("b837") + b"a" * 55) def test_decode_non_canonical_long_list_for_short(self) -> None: """Using long list encoding for short list is non-canonical.""" # 0xf8 followed by length 0x37 (55) - should have used short encoding - with pytest.raises(RLPDecodingError, match=r"Non-canonical.*long list"): + expected = r"^Non-canonical: long list encoding for short list$" + with pytest.raises(RLPDecodingError, match=expected): decode_rlp(bytes.fromhex("f837") + bytes.fromhex("80") * 55) + def test_decode_non_canonical_leading_zeros_long_string(self) -> None: + """Long string whose multi-byte length carries a leading zero is non-canonical.""" + # 0xb9 marks a long string with two length bytes. + # The length bytes 00 38 decode to 56. + # The leading zero is redundant since the canonical form is 0xb8 38 with one length byte. + expected = r"^Non-canonical: leading zeros in length encoding$" + with pytest.raises(RLPDecodingError, match=expected): + decode_rlp(bytes.fromhex("b90038") + b"a" * 56) + + def test_decode_non_canonical_leading_zeros_long_list(self) -> None: + """Long list whose multi-byte length carries a leading zero is non-canonical.""" + # 0xf9 marks a long list with two length bytes. + # The length bytes 00 38 decode to 56. + # The leading zero is redundant since the canonical form is 0xf8 38 with one length byte. + expected = r"^Non-canonical: leading zeros in length encoding$" + with pytest.raises(RLPDecodingError, match=expected): + decode_rlp(bytes.fromhex("f90038") + bytes.fromhex("80") * 56) + + def test_decode_list_payload_length_mismatch(self) -> None: + """Inner item that overshoots the parent list boundary is rejected.""" + # Outer list 0xc3 declares three payload bytes. + # Inner short string 0x85 declares five data bytes. + # Those five bytes fit in the buffer but extend past the outer list's payload end. + with pytest.raises(RLPDecodingError, match=r"^List payload length mismatch$"): + decode_rlp(bytes.fromhex("c3856161616161")) + class TestDecodeListFunction: """Tests for the decode_list convenience function.""" @@ -476,12 +537,13 @@ def test_decode_list_success(self) -> None: def test_decode_list_not_a_list(self) -> None: """decode_list raises error when data is not a list.""" - with pytest.raises(RLPDecodingError, match=r"Expected RLP list"): + with pytest.raises(RLPDecodingError, match=r"^Expected RLP list$"): decode_rlp_list(bytes.fromhex("83646f67")) # Encodes "dog", not a list def test_decode_list_nested_list_rejected(self) -> None: """decode_list raises error when list contains nested lists.""" - with pytest.raises(RLPDecodingError, match=r"Element .* is not bytes"): + # First element of [[[], []], []] is the inner list at index 0. + with pytest.raises(RLPDecodingError, match=r"^Element 0 is not bytes$"): decode_rlp_list(bytes.fromhex("c4c2c0c0c0")) # [[[], []], []] @@ -639,14 +701,14 @@ class TestBoundaryConditions: """Tests for boundary conditions based on module constants.""" def test_single_byte_max_boundary(self) -> None: - """Verify SINGLE_BYTE_MAX boundary (0x7f vs 0x80).""" + """Verify single-byte boundary (0x7f vs 0x80).""" # 0x7f = single byte encoding assert encode_rlp(bytes([SINGLE_BYTE_MAX])) == bytes([SINGLE_BYTE_MAX]) # 0x80 = short string encoding assert encode_rlp(bytes([SINGLE_BYTE_MAX + 1])) == bytes([0x81, 0x80]) def test_short_string_max_boundary(self) -> None: - """Verify SHORT_STRING_MAX_LEN boundary (55 vs 56 bytes).""" + """Verify short-string boundary (55 vs 56 bytes).""" # 55 bytes = short string encoding (prefix 0xb7) data_55 = b"a" * SHORT_STRING_MAX_LEN encoded_55 = encode_rlp(data_55) @@ -658,7 +720,7 @@ def test_short_string_max_boundary(self) -> None: assert encoded_56[0] == LONG_STRING_PREFIX # 0xb8 def test_short_list_max_boundary(self) -> None: - """Verify SHORT_LIST_MAX_LEN boundary (55 vs 56 bytes payload).""" + """Verify short-list boundary (55 vs 56 bytes payload).""" # 55 bytes payload = short list encoding (prefix 0xf7) items_55: list[RLPItem] = [b"a" for _ in range(SHORT_LIST_MAX_LEN)] encoded_55 = encode_rlp(items_55) @@ -671,7 +733,7 @@ def test_short_list_max_boundary(self) -> None: def test_prefix_boundaries(self) -> None: """Verify prefix range boundaries from RLP spec.""" - # Verify constants match RLP specification + # Verify literals match RLP specification assert SHORT_STRING_PREFIX == 0x80 assert LONG_STRING_PREFIX == 0xB8 assert SHORT_LIST_PREFIX == 0xC0