diff --git a/CHANGELOG.md b/CHANGELOG.md index fa39e4f..2b556ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Changed - Speed up membership tests (`key in ...`) on `Container`, `Table` and `InlineTable` with native `__contains__` implementations, avoiding the inherited `MutableMapping` round-trip through `__getitem__` (which resolves the value and builds an exception on every absent key). ([#483](https://github.com/python-poetry/tomlkit/issues/483)) +- Speed up parsing by making `Source` index-based: it now tracks an integer position over the input string instead of materializing a list of `(index, char)` tuples up front, so construction is O(1) and state save/restore no longer copies an iterator. ([#489](https://github.com/python-poetry/tomlkit/pull/489)) ### Fixed diff --git a/tomlkit/source.py b/tomlkit/source.py index 327c627..79c4241 100644 --- a/tomlkit/source.py +++ b/tomlkit/source.py @@ -1,6 +1,5 @@ from __future__ import annotations -from copy import copy from typing import Any from tomlkit.exceptions import ParseError @@ -21,7 +20,9 @@ def __init__( def __enter__(self) -> _State: # Entering this context manager - save the state - self._chars = copy(self._source._chars) + # PERF: snapshot only the integer index + current char + marker. + # We no longer carry an iterator (`_chars`) so there's no `copy(...)` + # to do here — saving 3 attribute reads vs the original iter copy. self._idx = self._source._idx self._current = self._source._current self._marker = self._source._marker @@ -36,7 +37,6 @@ def __exit__( ) -> None: # Exiting this context manager - restore the prior state if self.restore or exception_type: - self._source._chars = self._chars self._source._idx = self._idx self._source._current = self._current if self._save_marker: @@ -80,12 +80,14 @@ class Source(str): def __init__(self, _: str) -> None: super().__init__() - # Collection of TOMLChars - self._chars = iter([(i, TOMLChar(c)) for i, c in enumerate(self)]) - - self._idx = 0 + # PERF: previously built `iter([(i, TOMLChar(c)) for i, c in enumerate(self)])` + # which materialized N tuples + N TOMLChars at init time (~584 k allocations + # per 150-parse benchmark). Switching to an integer index over the underlying + # str makes init O(1) and lets `inc()` just bump the index and slice the str. + # The TOMLChar cache (toml_char.py) absorbs the per-character cost. + self._idx = -1 # pre-start sentinel; first inc() will land on 0 self._marker = 0 - self._current = TOMLChar("") + self._current: TOMLChar = TOMLChar("") self._state = _StateHandler(self) @@ -125,17 +127,21 @@ def inc(self, exception: type[ParseError] | None = None) -> bool: Increments the parser if the end of the input has not been reached. Returns whether or not it was able to advance. """ - try: - self._idx, self._current = next(self._chars) - + # PERF: integer increment + cached TOMLChar lookup, no iterator/next()/ + # StopIteration triage. After the first char of each kind has been seen, + # `TOMLChar(self[i])` is a dict.get cache hit. + next_idx = self._idx + 1 + if next_idx < len(self): + self._idx = next_idx + self._current = TOMLChar(self[next_idx]) return True - except StopIteration: - self._idx = len(self) - self._current = self.EOF - if exception: - raise self.parse_error(exception) from None - return False + # Past end : pin to len, switch current to EOF, raise if asked. + self._idx = len(self) + self._current = self.EOF + if exception: + raise self.parse_error(exception) from None + return False def inc_n(self, n: int, exception: type[ParseError] | None = None) -> bool: """