Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
### Changed

- Speed up membership tests (`key in ...`) on `Container`, `Table` and `InlineTable` with native `__contains__` implementations, avoiding the inherited `MutableMapping` round-trip through `__getitem__` (which resolves the value and builds an exception on every absent key). ([#483](https://github.com/python-poetry/tomlkit/issues/483))
- Speed up parsing by making `Source` index-based: it now tracks an integer position over the input string instead of materializing a list of `(index, char)` tuples up front, so construction is O(1) and state save/restore no longer copies an iterator. ([#489](https://github.com/python-poetry/tomlkit/pull/489))
- Speed up parsing by scanning character runs in bulk: `Source.advance_while`/`advance_until` consume a whole run of whitespace, bare-key or number characters in a single pass over the input string instead of one `inc()` call per character. ([#490](https://github.com/python-poetry/tomlkit/pull/490))
- Speed up parsing of single-line strings by bulk-appending the run of ordinary characters up to the next delimiter, backslash or control character in one pass, instead of one character at a time. ([#491](https://github.com/python-poetry/tomlkit/pull/491))
- Speed up parsing by removing the internal `TOMLChar` wrapper: the parser now reads plain `str` characters from `Source` and detects end-of-input positionally, avoiding a per-character object construction and method dispatch. ([#492](https://github.com/python-poetry/tomlkit/pull/492))

### Fixed

Expand Down
103 changes: 71 additions & 32 deletions tomlkit/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
from tomlkit.items import Whitespace
from tomlkit.source import Source
from tomlkit.source import _StateHandler
from tomlkit.toml_char import TOMLChar
from tomlkit.toml_document import TOMLDocument


Expand All @@ -57,6 +56,33 @@
CTRL_CHAR_LIMIT = 0x1F
CHR_DEL = 0x7F

# TOML character classes (formerly the `TOMLChar` constants). The parser works on
# plain 1-char `str`s read from `Source`. The plain strings below spell out each
# class; per-character membership tests use the `*_SET` frozensets built from them.
_SPACES = " \t"
_NL = "\n\r"
_WS = _SPACES + _NL
_BARE = string.ascii_letters + string.digits + "-_"
_KV = "= \t"

_SPACES_SET = frozenset(_SPACES)
_NL_SET = frozenset(_NL)
_WS_SET = frozenset(_WS)
_KV_SET = frozenset(_KV)
# Character sets for Source.advance_while / advance_until bulk run scans
# (replace per-character `while self._current in <set> and self.inc()` loops with
# a single underlying-string scan).
_BARE_KEY_OR_SPACE = frozenset(_BARE + _SPACES)
_NUM_STOP = frozenset(" \t\n\r#,]}")
_DATE_TAIL_STOP = frozenset("\t\n\r#,]}")
# Control chars invalid inside a single-line string (DEL + everything <= 0x1F
# except tab) — exactly the set that raises InvalidControlChar in the per-char
# string loop. The single-line string-body fast-path stops its bulk scan at the
# first delimiter / backslash / control char, then the main loop handles that
# char with its existing branch (raising InvalidControlChar where needed).
_CTRL_SINGLE = frozenset(chr(c) for c in range(0x20) if c != CTRL_I) | {chr(CHR_DEL)}
_SINGLE_LITERAL_STOP = _CTRL_SINGLE | {"'"} # literal: only the closing quote
_SINGLE_BASIC_STOP = _CTRL_SINGLE | {'"', "\\"} # basic: quote or escape


class Parser:
"""
Expand All @@ -78,7 +104,7 @@ def _idx(self) -> int:
return self._src.idx

@property
def _current(self) -> TOMLChar:
def _current(self) -> str:
return self._src.current

@property
Expand Down Expand Up @@ -276,7 +302,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]
self.inc() # Skip #

# The comment itself
while not self.end() and not self._current.is_nl():
while not self.end() and self._current not in _NL_SET:
code = ord(self._current)
if code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I):
raise self.parse_error(InvalidControlChar, code, "comments")
Expand Down Expand Up @@ -304,8 +330,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]

trail = ""
if parse_trail:
while self._current.is_spaces() and self.inc():
pass
self._src.advance_while(_SPACES_SET)

if self._current == "\r":
with self._state(restore=True):
Expand All @@ -316,7 +341,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str]
if self._current == "\n":
self.inc()

if self._idx != self._marker or self._current.is_ws():
if self._idx != self._marker or self._current in _WS_SET:
trail = self.extract()

return comment_ws, comment, trail
Expand All @@ -325,8 +350,7 @@ def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]:
# Leading indent
self.mark()

while self._current.is_spaces() and self.inc():
pass
self._src.advance_while(_SPACES_SET)

indent = self.extract()

Expand All @@ -336,7 +360,7 @@ def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]:
self.mark()

found_equals = self._current == "="
while self._current.is_kv_sep() and self.inc():
while self._current in _KV_SET and self.inc():
if self._current == "=":
if found_equals:
raise self.parse_error(UnexpectedCharError, "=")
Expand Down Expand Up @@ -374,9 +398,8 @@ def _parse_key(self) -> Key:
WS before the key must be exhausted first at the callsite.
"""
self.mark()
while self._current.is_spaces() and self.inc():
# Skip any leading whitespace
pass
# Skip any leading whitespace (bulk scan)
self._src.advance_while(_SPACES_SET)
if self._current in "\"'":
return self._parse_quoted_key()
else:
Expand All @@ -401,8 +424,7 @@ def _parse_quoted_key(self) -> Key:
raise self.parse_error(UnexpectedCharError, key_str._t.value)
original += key_str.as_string()
self.mark()
while self._current.is_spaces() and self.inc():
pass
self._src.advance_while(_SPACES_SET)
original += self.extract()
result: Key = SingleKey(str(key_str), t=key_type, sep="", original=original)
if self._current == ".":
Expand All @@ -415,10 +437,7 @@ def _parse_bare_key(self) -> Key:
"""
Parses a bare key.
"""
while (
self._current.is_bare_key_char() or self._current.is_spaces()
) and self.inc():
pass
self._src.advance_while(_BARE_KEY_OR_SPACE)

original = self.extract()
key_s = original.strip()
Expand Down Expand Up @@ -467,8 +486,7 @@ def _parse_value(self) -> Item:
"nan",
}:
# Number
while self._current not in " \t\n\r#,]}" and self.inc():
pass
self._src.advance_until(_NUM_STOP)

raw = self.extract()

Expand All @@ -479,8 +497,7 @@ def _parse_value(self) -> Item:
raise self.parse_error(InvalidNumberError)
elif c in string.digits:
# Integer, Float, Date, Time or DateTime
while self._current not in " \t\n\r#,]}" and self.inc():
pass
self._src.advance_until(_NUM_STOP)

raw = self.extract()

Expand Down Expand Up @@ -512,8 +529,7 @@ def _parse_value(self) -> Item:
assert isinstance(dt, datetime.date)
date = Date(dt.year, dt.month, dt.day, trivia, raw)
self.mark()
while self._current not in "\t\n\r#,]}" and self.inc():
pass
self._src.advance_until(_DATE_TAIL_STOP)

time_raw = self.extract()
time_part = time_raw.rstrip()
Expand Down Expand Up @@ -588,9 +604,9 @@ def _parse_array(self) -> Array:
while True:
# consume whitespace
mark = self._idx
self.consume(TOMLChar.SPACES + TOMLChar.NL)
self.consume(_WS)
indent = self._src[mark : self._idx]
newline = set(TOMLChar.NL) & set(indent)
newline = _NL_SET & set(indent)
if newline:
elems.append(Whitespace(indent))
continue
Expand Down Expand Up @@ -653,7 +669,7 @@ def _parse_inline_table(self) -> InlineTable:
while True:
# consume whitespace and newlines
mark = self._idx
self.consume(TOMLChar.SPACES + TOMLChar.NL)
self.consume(_WS)
raw = self._src[mark : self._idx]
if raw:
elems.add(Whitespace(raw))
Expand Down Expand Up @@ -743,7 +759,7 @@ def _parse_basic_string(self) -> String:
return self._parse_string(StringType.SLB)

def _parse_escaped_char(self, multiline: bool) -> str:
if multiline and self._current.is_ws():
if multiline and self._current in _WS_SET:
# When the last non-whitespace character on a line is
# a \, it will be trimmed along with all whitespace
# (including newlines) up to the next non-whitespace
Expand All @@ -752,7 +768,7 @@ def _parse_escaped_char(self, multiline: bool) -> str:
# hello \
# world"""
tmp = ""
while self._current.is_ws():
while self._current in _WS_SET:
tmp += self._current
# consume the whitespace, EOF here is an issue
# (middle of string)
Expand Down Expand Up @@ -838,6 +854,15 @@ def _parse_string(self, delim: StringType) -> String:
if cur == "\r\n":
self.inc_n(2, exception=UnexpectedEofError)

# PERF: stop-set for the single-line string-body bulk fast-path (None for
# multiline, which keeps the per-char loop because of \r\n handling).
src = self._src
single_stop = None
if delim.is_singleline():
single_stop = (
_SINGLE_BASIC_STOP if delim.is_basic() else _SINGLE_LITERAL_STOP
)

escaped = False # whether the previous key was ESCAPE
while True:
code = ord(self._current)
Expand Down Expand Up @@ -913,10 +938,24 @@ def _parse_string(self, delim: StringType) -> String:
else:
# this is either a literal string where we keep everything as is,
# or this is not a special escaped char in a basic string
value += self._current
if single_stop is not None:
# PERF fast-path: bulk-append the run of ordinary characters
# up to the next delimiter / backslash / control char, instead
# of one `value += cur; inc()` iteration per character. The
# stop char is then handled by the branches above on the next
# iteration (single-line only; multiline keeps the per-char
# loop for CRLF handling).
run_start = src._idx
src.advance_until(single_stop)
if src.end():
# mid-string EOF — same error as the per-char inc()
raise self.parse_error(UnexpectedEofError)
value += src[run_start : src._idx]
else:
value += self._current

# consume this char, EOF here is an issue (middle of string)
self.inc(exception=UnexpectedEofError)
# consume this char, EOF here is an issue (middle of string)
self.inc(exception=UnexpectedEofError)

def _parse_table(
self, parent_name: Key | None = None, parent: Table | None = None
Expand Down
Loading
Loading