From 5949b835c690e29acdcb9328d07101b7a0a670f7 Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 20:02:25 +0000 Subject: [PATCH 1/7] perf: eliminate copy from super-table merge path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When merging super-tables (e.g. [tool.ruff] into existing [tool]), the parser was deep-copying the entire existing table just to append new entries alongside it. Since TOML forbids duplicate keys, the existing items are never modified — we can simply mutate the table in place, appending new entries directly. The only place that needed protection was the out-of-order table validation path (OutOfOrderTableProxy.validate), which creates a temp container and re-merges fragments to check for conflicts. Move the (shallow) copy there — it's a rare path that only runs when tables of the same name are separated by unrelated tables. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/container.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tomlkit/container.py b/tomlkit/container.py index 4387471..bb00587 100644 --- a/tomlkit/container.py +++ b/tomlkit/container.py @@ -258,17 +258,10 @@ def append( return self - # Create a new element to replace the old one - current = copy.deepcopy(current) + # Merge the new super table's entries into + # the existing one. for k, v in item.value.body: current.append(k, v) - self._body[ - ( - current_idx[-1] - if isinstance(current_idx, tuple) - else current_idx - ) - ] = (current_body_element[0], current) return self elif ( @@ -856,6 +849,7 @@ def __copy__(self) -> Self: c._body += self.body c._map.update(self._map) + c._table_keys = list(self._table_keys) return c @@ -885,13 +879,16 @@ class OutOfOrderTableProxy(_CustomDict): # type: ignore[type-arg] @staticmethod def validate(container: Container, indices: tuple[int, ...]) -> None: """Validate out of order tables in the given container""" - # Append all items to a temp container to see if there is any error + # Append all items to a temp container to see if there is any error. + # Shallow-copy Tables so the merge path doesn't mutate originals. temp_container = Container(True) for i in indices: _, item = container._body[i] if isinstance(item, Table): for k, v in item.value.body: + if isinstance(v, Table): + v = copy.copy(v) temp_container.append(k, v, validate=True) temp_container._validate_out_of_order_table() From 1231c3b54c101472242dc602a4607f8c15834b92 Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 19:52:02 +0000 Subject: [PATCH 2/7] perf: optimize parser hot paths - StringType.is_*() methods: replace set membership tests with identity comparison ("self is X or self is Y"). Enum singletons make "is" correct and it avoids creating a temporary set and hashing enum values on every call. These methods are called per-character during string parsing. - Parser._current/_idx/_marker: access Source._current etc directly, bypassing an unnecessary property indirection layer. With ~2M accesses per parse this eliminates millions of redundant function calls. - _parse_string: hoist loop-invariant delim.is_singleline(), delim.is_multiline(), delim.is_basic(), and delim.unit into local variables before the per-character loop. The delimiter type never changes within the loop (it is set once, after the opening delimiter is consumed). - Use tuple instead of list for "in" check on control char codes (tuples are faster for containment tests). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/items.py | 8 ++++---- tomlkit/parser.py | 36 +++++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/tomlkit/items.py b/tomlkit/items.py index 8670f03..28acb8b 100644 --- a/tomlkit/items.py +++ b/tomlkit/items.py @@ -280,16 +280,16 @@ def unit(self) -> str: return self.value[0] def is_basic(self) -> bool: - return self in {StringType.SLB, StringType.MLB} + return self is StringType.SLB or self is StringType.MLB def is_literal(self) -> bool: - return self in {StringType.SLL, StringType.MLL} + return self is StringType.SLL or self is StringType.MLL def is_singleline(self) -> bool: - return self in {StringType.SLB, StringType.SLL} + return self is StringType.SLB or self is StringType.SLL def is_multiline(self) -> bool: - return self in {StringType.MLB, StringType.MLL} + return self is StringType.MLB or self is StringType.MLL def toggle(self) -> StringType: return { diff --git a/tomlkit/parser.py b/tomlkit/parser.py index 538ed03..4ec720e 100644 --- a/tomlkit/parser.py +++ b/tomlkit/parser.py @@ -71,19 +71,19 @@ def __init__(self, string: str | bytes) -> None: @property def _state(self) -> _StateHandler: - return self._src.state + return self._src._state @property def _idx(self) -> int: - return self._src.idx + return self._src._idx @property def _current(self) -> TOMLChar: - return self._src.current + return self._src._current @property def _marker(self) -> int: - return self._src.marker + return self._src._marker def extract(self) -> str: """ @@ -825,8 +825,14 @@ def _parse_string(self, delim: StringType) -> String: self.mark() # to extract the original string with whitespace and all value = "" + # Pre-compute delim properties — these are constant through the loop + delim_is_singleline = delim.is_singleline() + delim_is_multiline = delim.is_multiline() + delim_is_basic = delim.is_basic() + delim_unit = delim.unit + # A newline immediately following the opening delimiter will be trimmed. - if delim.is_multiline(): + if delim_is_multiline: if self._current == "\n": # consume the newline, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) @@ -842,33 +848,33 @@ def _parse_string(self, delim: StringType) -> String: while True: code = ord(self._current) if ( - delim.is_singleline() + delim_is_singleline and not escaped and (code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I)) ) or ( - delim.is_multiline() + delim_is_multiline and not escaped and ( code == CHR_DEL or ( - code <= CTRL_CHAR_LIMIT and code not in [CTRL_I, CTRL_J, CTRL_M] + code <= CTRL_CHAR_LIMIT and code not in (CTRL_I, CTRL_J, CTRL_M) ) ) ): raise self.parse_error(InvalidControlChar, code, "strings") - elif delim.is_multiline() and not escaped and self._current == "\r": + elif delim_is_multiline and not escaped and self._current == "\r": with self._state(restore=True): if not self.inc() or self._current != "\n": raise self.parse_error(InvalidControlChar, CTRL_M, "strings") - elif not escaped and self._current == delim.unit: + elif not escaped and self._current == delim_unit: # try to process current as a closing delim original = self.extract() close = "" - if delim.is_multiline(): + if delim_is_multiline: # Consume the delimiters to see if we are at the end of the string close = "" - while self._current == delim.unit: + while self._current == delim_unit: close += self._current self.inc() @@ -895,14 +901,14 @@ def _parse_string(self, delim: StringType) -> String: self.inc() return String(delim, value, original, Trivia()) - elif delim.is_basic() and escaped: + elif delim_is_basic and escaped: # attempt to parse the current char as an escaped value, an exception # is raised if this fails - value += self._parse_escaped_char(delim.is_multiline()) + value += self._parse_escaped_char(delim_is_multiline) # no longer escaped escaped = False - elif delim.is_basic() and self._current == "\\": + elif delim_is_basic and self._current == "\\": # the next char is being escaped escaped = True From c58785d2f92e77be7123aa7f7c629b127ff37b48 Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 20:20:55 +0000 Subject: [PATCH 3/7] perf: eliminate TOMLChar class and iterator-based source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the TOMLChar wrapper class with plain module-level string constants (BARE, KV, NUMBER, SPACES, NL, WS). Character class checks become simple `c in CONSTANT` instead of method calls on a str subclass, eliminating 710k object creations per parse. Switch Source from an iterator over a pre-built list of (int, TOMLChar) tuples to direct string indexing. The _State context manager now saves/restores just _idx, _current, and _marker — three scalar assignments instead of copying a list iterator. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/parser.py | 39 +++++++++++++++--------------- tomlkit/source.py | 48 ++++++++++++++----------------------- tomlkit/toml_char.py | 56 +++++--------------------------------------- 3 files changed, 43 insertions(+), 100 deletions(-) diff --git a/tomlkit/parser.py b/tomlkit/parser.py index 4ec720e..8cc78d6 100644 --- a/tomlkit/parser.py +++ b/tomlkit/parser.py @@ -47,7 +47,11 @@ from tomlkit.items import Whitespace from tomlkit.source import Source from tomlkit.source import _StateHandler -from tomlkit.toml_char import TOMLChar +from tomlkit.toml_char import BARE +from tomlkit.toml_char import KV +from tomlkit.toml_char import NL +from tomlkit.toml_char import SPACES +from tomlkit.toml_char import WS from tomlkit.toml_document import TOMLDocument @@ -78,7 +82,7 @@ def _idx(self) -> int: return self._src._idx @property - def _current(self) -> TOMLChar: + def _current(self) -> str: return self._src._current @property @@ -276,7 +280,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str] self.inc() # Skip # # The comment itself - while not self.end() and not self._current.is_nl(): + while not self.end() and self._current not in NL: code = ord(self._current) if code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I): raise self.parse_error(InvalidControlChar, code, "comments") @@ -304,7 +308,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str] trail = "" if parse_trail: - while self._current.is_spaces() and self.inc(): + while self._current in SPACES and self.inc(): pass if self._current == "\r": @@ -316,7 +320,7 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str] if self._current == "\n": self.inc() - if self._idx != self._marker or self._current.is_ws(): + if self._idx != self._marker or self._current in WS: trail = self.extract() return comment_ws, comment, trail @@ -325,7 +329,7 @@ def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]: # Leading indent self.mark() - while self._current.is_spaces() and self.inc(): + while self._current in SPACES and self.inc(): pass indent = self.extract() @@ -336,7 +340,7 @@ def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]: self.mark() found_equals = self._current == "=" - while self._current.is_kv_sep() and self.inc(): + while self._current in KV and self.inc(): if self._current == "=": if found_equals: raise self.parse_error(UnexpectedCharError, "=") @@ -374,8 +378,7 @@ def _parse_key(self) -> Key: WS before the key must be exhausted first at the callsite. """ self.mark() - while self._current.is_spaces() and self.inc(): - # Skip any leading whitespace + while self._current in SPACES and self.inc(): pass if self._current in "\"'": return self._parse_quoted_key() @@ -401,7 +404,7 @@ def _parse_quoted_key(self) -> Key: raise self.parse_error(UnexpectedCharError, key_str._t.value) original += key_str.as_string() self.mark() - while self._current.is_spaces() and self.inc(): + while self._current in SPACES and self.inc(): pass original += self.extract() result: Key = SingleKey(str(key_str), t=key_type, sep="", original=original) @@ -415,9 +418,7 @@ def _parse_bare_key(self) -> Key: """ Parses a bare key. """ - while ( - self._current.is_bare_key_char() or self._current.is_spaces() - ) and self.inc(): + while (self._current in BARE or self._current in SPACES) and self.inc(): pass original = self.extract() @@ -588,9 +589,9 @@ def _parse_array(self) -> Array: while True: # consume whitespace mark = self._idx - self.consume(TOMLChar.SPACES + TOMLChar.NL) + self.consume(SPACES + NL) indent = self._src[mark : self._idx] - newline = set(TOMLChar.NL) & set(indent) + newline = set(NL) & set(indent) if newline: elems.append(Whitespace(indent)) continue @@ -653,7 +654,7 @@ def _parse_inline_table(self) -> InlineTable: while True: # consume whitespace and newlines mark = self._idx - self.consume(TOMLChar.SPACES + TOMLChar.NL) + self.consume(SPACES + NL) raw = self._src[mark : self._idx] if raw: elems.add(Whitespace(raw)) @@ -743,7 +744,7 @@ def _parse_basic_string(self) -> String: return self._parse_string(StringType.SLB) def _parse_escaped_char(self, multiline: bool) -> str: - if multiline and self._current.is_ws(): + if multiline and self._current in WS: # When the last non-whitespace character on a line is # a \, it will be trimmed along with all whitespace # (including newlines) up to the next non-whitespace @@ -752,7 +753,7 @@ def _parse_escaped_char(self, multiline: bool) -> str: # hello \ # world""" tmp = "" - while self._current.is_ws(): + while self._current in WS: tmp += self._current # consume the whitespace, EOF here is an issue # (middle of string) @@ -1134,7 +1135,7 @@ def _peek(self, n: int) -> str: with self._state(restore=True): buf = "" for _ in range(n): - if self._current not in " \t\n\r#,]}" + self._src.EOF: + if self._current not in " \t\n\r#,]}\0": buf += self._current self.inc() continue diff --git a/tomlkit/source.py b/tomlkit/source.py index 327c627..cc943e3 100644 --- a/tomlkit/source.py +++ b/tomlkit/source.py @@ -1,11 +1,9 @@ from __future__ import annotations -from copy import copy from typing import Any from tomlkit.exceptions import ParseError from tomlkit.exceptions import UnexpectedCharError -from tomlkit.toml_char import TOMLChar class _State: @@ -21,7 +19,6 @@ def __init__( def __enter__(self) -> _State: # Entering this context manager - save the state - self._chars = copy(self._source._chars) self._idx = self._source._idx self._current = self._source._current self._marker = self._source._marker @@ -36,7 +33,6 @@ def __exit__( ) -> None: # Exiting this context manager - restore the prior state if self.restore or exception_type: - self._source._chars = self._chars self._source._idx = self._idx self._source._current = self._current if self._save_marker: @@ -75,29 +71,18 @@ def __exit__( class Source(str): - EOF = TOMLChar("\0") + EOF = "\0" def __init__(self, _: str) -> None: super().__init__() - # Collection of TOMLChars - self._chars = iter([(i, TOMLChar(c)) for i, c in enumerate(self)]) - + self._length = len(self) self._idx = 0 self._marker = 0 - self._current = TOMLChar("") + self._current = self[0] if self._length > 0 else self.EOF self._state = _StateHandler(self) - self.inc() - - def reset(self) -> None: - # initialize both idx and current - self.inc() - - # reset marker - self.mark() - @property def state(self) -> _StateHandler: return self._state @@ -107,7 +92,7 @@ def idx(self) -> int: return self._idx @property - def current(self) -> TOMLChar: + def current(self) -> str: return self._current @property @@ -125,17 +110,18 @@ def inc(self, exception: type[ParseError] | None = None) -> bool: Increments the parser if the end of the input has not been reached. Returns whether or not it was able to advance. """ - try: - self._idx, self._current = next(self._chars) - + idx = self._idx + 1 + if idx < self._length: + self._idx = idx + self._current = self[idx] return True - except StopIteration: - self._idx = len(self) - self._current = self.EOF - if exception: - raise self.parse_error(exception) from None - return False + self._idx = self._length + self._current = self.EOF + if exception: + raise self.parse_error(exception) from None + + return False def inc_n(self, n: int, exception: type[ParseError] | None = None) -> bool: """ @@ -148,7 +134,7 @@ def consume(self, chars: str, min: int = 0, max: int = -1) -> None: """ Consume chars until min/max is satisfied is valid. """ - while self.current in chars and max != 0: + while self._current in chars and max != 0: min -= 1 max -= 1 if not self.inc(): @@ -156,13 +142,13 @@ def consume(self, chars: str, min: int = 0, max: int = -1) -> None: # failed to consume minimum number of characters if min > 0: - raise self.parse_error(UnexpectedCharError, self.current) + raise self.parse_error(UnexpectedCharError, self._current) def end(self) -> bool: """ Returns True if the parser has reached the end of the input. """ - return self._current is self.EOF + return self._idx >= self._length def mark(self) -> None: """ diff --git a/tomlkit/toml_char.py b/tomlkit/toml_char.py index 970cbd4..2381afe 100644 --- a/tomlkit/toml_char.py +++ b/tomlkit/toml_char.py @@ -1,52 +1,8 @@ import string - -class TOMLChar(str): - def __init__(self, c: str) -> None: - super().__init__() - - if len(self) > 1: - raise ValueError("A TOML character must be of length 1") - - BARE = string.ascii_letters + string.digits + "-_" - KV = "= \t" - NUMBER = string.digits + "+-_.e" - SPACES = " \t" - NL = "\n\r" - WS = SPACES + NL - - def is_bare_key_char(self) -> bool: - """ - Whether the character is a valid bare key name or not. - """ - return self in self.BARE - - def is_kv_sep(self) -> bool: - """ - Whether the character is a valid key/value separator or not. - """ - return self in self.KV - - def is_int_float_char(self) -> bool: - """ - Whether the character if a valid integer or float value character or not. - """ - return self in self.NUMBER - - def is_ws(self) -> bool: - """ - Whether the character is a whitespace character or not. - """ - return self in self.WS - - def is_nl(self) -> bool: - """ - Whether the character is a new line character or not. - """ - return self in self.NL - - def is_spaces(self) -> bool: - """ - Whether the character is a space or not - """ - return self in self.SPACES +BARE = string.ascii_letters + string.digits + "-_" +KV = "= \t" +NUMBER = string.digits + "+-_.e" +SPACES = " \t" +NL = "\n\r" +WS = SPACES + NL From 03c73af6a36b5ae3946f05f7de2975722f97564b Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 20:49:18 +0000 Subject: [PATCH 4/7] perf: eliminate unnecessary object creation in hot paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three structural fixes that remove redundant work during parsing: 1. Table.raw_append: after appending a dotted key, used to read the value back via Container.__getitem__ (which creates a throwaway SingleKey, searches the map, etc). Now uses dict.__getitem__ on the Container directly — same result, no intermediate objects. 2. Container.append: checked 'key in self' which goes through MutableMapping.__contains__ → __getitem__ → item() → SingleKey. Changed to 'key in self._map' — a direct dict lookup on the internal map that already uses Key objects as keys. 3. SingleKey.__init__: the bare-key character check was rebuilding string.ascii_letters + string.digits + '-_' on every call. Now uses the pre-computed BARE constant from toml_char. Together these eliminate ~46k unnecessary SingleKey creations and ~26k unnecessary Container.__getitem__ calls per 500 parses. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/container.py | 2 +- tomlkit/items.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tomlkit/container.py b/tomlkit/container.py index bb00587..693c809 100644 --- a/tomlkit/container.py +++ b/tomlkit/container.py @@ -200,7 +200,7 @@ def append( if item and not ("\n" in item[0].trivia.indent or prev_ws): item[0].trivia.indent = "\n" + item[0].trivia.indent - if key is not None and key in self: + if key is not None and key in self._map: current_idx = self._map[key] if isinstance(current_idx, tuple): current_body_element = self._body[current_idx[-1]] diff --git a/tomlkit/items.py b/tomlkit/items.py index 28acb8b..c8c6c5a 100644 --- a/tomlkit/items.py +++ b/tomlkit/items.py @@ -5,7 +5,6 @@ import dataclasses import inspect import re -import string from collections.abc import Collection from collections.abc import Iterable @@ -32,6 +31,7 @@ from tomlkit._utils import escape_string from tomlkit.exceptions import ConvertError from tomlkit.exceptions import InvalidStringError +from tomlkit.toml_char import BARE if TYPE_CHECKING: @@ -404,9 +404,7 @@ def __init__( raise TypeError("Keys must be strings") if t is None: - if not k or any( - c not in string.ascii_letters + string.digits + "-" + "_" for c in k - ): + if not k or any(c not in BARE for c in k): t = KeyType.Basic else: t = KeyType.Bare @@ -1919,7 +1917,9 @@ def raw_append(self, key: Key | str | None, _item: Any) -> Table: if isinstance(key, Key): key = next(iter(key)).key - _item = self._value[key] + # Get the stored value directly from the Container's dict, + # avoiding __getitem__ which would create a throwaway SingleKey. + _item = dict.__getitem__(self._value, key) if key is not None: dict.__setitem__(self, key, _item) From c888700fc3cee8c8357057216612def4a1e2dec7 Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 21:08:49 +0000 Subject: [PATCH 5/7] perf: eliminate double-parsing of table headers and speculative errors Structural changes to avoid unnecessary work: 1. Extract _parse_table_header() and pass pre-parsed headers to _parse_table(), eliminating the peek-then-reparse pattern where every table header was parsed twice (once to peek, once for real). Removes _peek_table() entirely. 2. Reorder _parse_array() to check for closing bracket before attempting to parse a value, eliminating 3 speculative UnexpectedCharError constructions per parse (each requiring an expensive _to_linecol() call that scans the full source). Total function calls reduced from 6.3M to 5.88M per 500 parses. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/parser.py | 107 ++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 50 deletions(-) diff --git a/tomlkit/parser.py b/tomlkit/parser.py index 8cc78d6..762741a 100644 --- a/tomlkit/parser.py +++ b/tomlkit/parser.py @@ -607,6 +607,12 @@ def _parse_array(self) -> Array: elems.append(Whitespace(indent)) continue + # consume closing bracket + if self._current == "]": + # consume closing bracket, EOF here doesn't matter + self.inc() + break + # consume value if not prev_value: try: @@ -627,12 +633,6 @@ def _parse_array(self) -> Array: prev_value = False continue - # consume closing bracket - if self._current == "]": - # consume closing bracket, EOF here doesn't matter - self.inc() - break - raise self.parse_error(UnexpectedCharError, self._current) try: @@ -923,17 +923,13 @@ def _parse_string(self, delim: StringType) -> String: # consume this char, EOF here is an issue (middle of string) self.inc(exception=UnexpectedEofError) - def _parse_table( - self, parent_name: Key | None = None, parent: Table | None = None - ) -> tuple[Key, Table | AoT]: + def _parse_table_header(self) -> tuple[str, bool, Key]: """ - Parses a table element. - """ - if self._current != "[": - raise self.parse_error( - InternalParserError, "_parse_table() called on non-bracket character." - ) + Parses the header of a table ([key] or [[key]]). + Returns (indent, is_aot, key). + Leaves the parser positioned at the closing ']'. + """ indent = self.extract() self.inc() # Skip opening bracket @@ -950,6 +946,28 @@ def _parse_table( key = self._parse_key() except EmptyKeyError: raise self.parse_error(EmptyTableNameError) from None + + return indent, is_aot, key + + def _parse_table( + self, + parent_name: Key | None = None, + parent: Table | None = None, + _header: tuple[str, bool, Key] | None = None, + ) -> tuple[Key, Table | AoT]: + """ + Parses a table element. + """ + if _header is not None: + indent, is_aot, key = _header + else: + if self._current != "[": + raise self.parse_error( + InternalParserError, + "_parse_table() called on non-bracket character.", + ) + indent, is_aot, key = self._parse_table_header() + if self.end(): raise self.parse_error(UnexpectedEofError) elif self._current != "]": @@ -1045,23 +1063,34 @@ def _parse_table( table.raw_append(_key, _val) else: if self._current == "[": - _, key_next = self._peek_table() + # Parse header tentatively to check for child table + src = self._src + saved = (src._idx, src._current, src._marker) + header = self._parse_table_header() + key_next = header[2] if self._is_child(full_key, key_next): - key_next, table_next = self._parse_table(full_key, table) - + key_next, table_next = self._parse_table( + full_key, table, _header=header + ) table.raw_append(key_next, table_next) # Picking up any sibling while not self.end(): - _, key_next = self._peek_table() + saved = (src._idx, src._current, src._marker) + header = self._parse_table_header() + key_next = header[2] if not self._is_child(full_key, key_next): + src._idx, src._current, src._marker = saved break - key_next, table_next = self._parse_table(full_key, table) - + key_next, table_next = self._parse_table( + full_key, table, _header=header + ) table.raw_append(key_next, table_next) + else: + src._idx, src._current, src._marker = saved break else: @@ -1078,33 +1107,6 @@ def _parse_table( return key, result - def _peek_table(self) -> tuple[bool, Key]: - """ - Peeks ahead non-intrusively by cloning then restoring the - initial state of the parser. - - Returns the name of the table about to be parsed, - as well as whether it is part of an AoT. - """ - # we always want to restore after exiting this scope - with self._state(save_marker=True, restore=True): - if self._current != "[": - raise self.parse_error( - InternalParserError, - "_peek_table() entered on non-bracket character", - ) - - # AoT - self.inc() - is_aot = False - if self._current == "[": - self.inc() - is_aot = True - try: - return is_aot, self._parse_key() - except EmptyKeyError: - raise self.parse_error(EmptyTableNameError) from None - def _parse_aot(self, first: Table, name_first: Key) -> AoT: """ Parses all siblings of the provided table first and bundles them into @@ -1112,13 +1114,18 @@ def _parse_aot(self, first: Table, name_first: Key) -> AoT: """ payload: list[Table] = [first] self._aot_stack.append(name_first) + src = self._src while not self.end(): - is_aot_next, name_next = self._peek_table() + saved = (src._idx, src._current, src._marker) + header = self._parse_table_header() + is_aot_next = header[1] + name_next = header[2] if is_aot_next and name_next == name_first: - _, table = self._parse_table(name_first) + _, table = self._parse_table(name_first, _header=header) assert isinstance(table, Table) payload.append(table) else: + src._idx, src._current, src._marker = saved break self._aot_stack.pop() From f14445cb5338e14aebc103a7ce83ab283c76196a Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 21:14:58 +0000 Subject: [PATCH 6/7] perf: skip formatting checks during parsing in Container.append MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During parsing, Container.append performed expensive formatting work (indentation adjustment, display name invalidation, insertion ordering) that was immediately short-circuited by the _parsed flag — but the isinstance checks guarding those blocks still ran through the slow ABCMeta path every time. Guard all formatting-only logic with 'if not self._parsed:', avoiding ~250k isinstance calls (including ~188k through ABCMeta) and ~38k ends_with_whitespace / _previous_item calls per 500 parses. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/container.py | 84 +++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/tomlkit/container.py b/tomlkit/container.py index 693c809..c5c12f5 100644 --- a/tomlkit/container.py +++ b/tomlkit/container.py @@ -182,23 +182,23 @@ def append( assert isinstance(key, Key) item.name = key.key - prev = self._previous_item() - prev_ws = isinstance(prev, Whitespace) or ends_with_whitespace(prev) - if isinstance(item, Table): - if not self._parsed: + if not self._parsed: + prev = self._previous_item() + prev_ws = isinstance(prev, Whitespace) or ends_with_whitespace(prev) + if isinstance(item, Table): item.invalidate_display_name() - if ( - self._body - and not (self._parsed or item.trivia.indent or prev_ws) - and key is not None - and not key.is_dotted() - ): - item.trivia.indent = "\n" + if ( + self._body + and not (item.trivia.indent or prev_ws) + and key is not None + and not key.is_dotted() + ): + item.trivia.indent = "\n" - if isinstance(item, AoT) and self._body and not self._parsed: - item.invalidate_display_name() - if item and not ("\n" in item[0].trivia.indent or prev_ws): - item[0].trivia.indent = "\n" + item[0].trivia.indent + if isinstance(item, AoT) and self._body: + item.invalidate_display_name() + if item and not ("\n" in item[0].trivia.indent or prev_ws): + item[0].trivia.indent = "\n" + item[0].trivia.indent if key is not None and key in self._map: current_idx = self._map[key] @@ -289,35 +289,31 @@ def append( else: raise KeyAlreadyPresent(key) - is_table = isinstance(item, (Table, AoT)) - if ( - key is not None - and self._body - and not self._parsed - and (not is_table or key.is_dotted()) - ): - # If there is already at least one table in the current container - # and the given item is not a table, we need to find the last - # item that is not a table and insert after it - # If no such item exists, insert at the top of the table - last_index = self._get_last_index_before_table() - - if last_index < len(self._body): - after_item = self._body[last_index][1] - if not ( - isinstance(after_item, Whitespace) - or "\n" in after_item.trivia.indent - ): - after_item.trivia.indent = "\n" + after_item.trivia.indent - return self._insert_at(last_index, key, item) - else: - previous_item = self._body[-1][1] - if not ( - isinstance(previous_item, Whitespace) - or ends_with_whitespace(previous_item) - or "\n" in previous_item.trivia.trail - ): - previous_item.trivia.trail += "\n" + if not self._parsed: + is_table = isinstance(item, (Table, AoT)) + if key is not None and self._body and (not is_table or key.is_dotted()): + # If there is already at least one table in the current container + # and the given item is not a table, we need to find the last + # item that is not a table and insert after it + # If no such item exists, insert at the top of the table + last_index = self._get_last_index_before_table() + + if last_index < len(self._body): + after_item = self._body[last_index][1] + if not ( + isinstance(after_item, Whitespace) + or "\n" in after_item.trivia.indent + ): + after_item.trivia.indent = "\n" + after_item.trivia.indent + return self._insert_at(last_index, key, item) + else: + previous_item = self._body[-1][1] + if not ( + isinstance(previous_item, Whitespace) + or ends_with_whitespace(previous_item) + or "\n" in previous_item.trivia.trail + ): + previous_item.trivia.trail += "\n" self._raw_append(key, item) return self From 9c27cc281d6da0f92143417f1d6c09da3364ec14 Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sun, 22 Mar 2026 20:31:29 +0000 Subject: [PATCH 7/7] perf: inline Source access in parser hot paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the Parser delegation layer (self._current, self.inc(), self.mark(), self.extract(), self.end()) with direct access to the Source object (src = self._src; src._current, src.inc(), etc.) in all hot methods. This eliminates ~3M Python function calls per 500 parses — one extra frame per property access or method delegation. The delegation wrappers are retained for use by less performance- sensitive code paths. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tomlkit/parser.py | 333 ++++++++++++++++++++++--------------------- tomlkit/toml_char.py | 1 + 2 files changed, 174 insertions(+), 160 deletions(-) diff --git a/tomlkit/parser.py b/tomlkit/parser.py index 762741a..8c53724 100644 --- a/tomlkit/parser.py +++ b/tomlkit/parser.py @@ -140,11 +140,12 @@ def parse_error( def parse(self) -> TOMLDocument: body = TOMLDocument(True) + src = self._src # Take all keyvals outside of tables/AoT's. - while not self.end(): + while src._idx < src._length: # Break out if a table is found - if self._current == "[": + if src._current == "[": break # Otherwise, take and append one KV @@ -158,11 +159,11 @@ def parse(self) -> TOMLDocument: try: body.append(key, value) except Exception as e: - raise self.parse_error(ParseError, str(e)) from e + raise src.parse_error(ParseError, str(e)) from e - self.mark() + src._marker = src._idx - while not self.end(): + while src._idx < src._length: key, value = self._parse_table() if isinstance(value, Table) and value.is_aot_element(): # This is just the first table in an AoT. Parse the rest of the array @@ -172,7 +173,7 @@ def parse(self) -> TOMLDocument: try: body.append(key, value) except Exception as e: - raise self.parse_error(ParseError, str(e)) from e + raise src.parse_error(ParseError, str(e)) from e body.parsing(False) @@ -218,28 +219,29 @@ def _parse_item(self) -> tuple[Key | None, Item] | None: Attempts to parse the next item and returns it, along with its key if the item is value-like. """ - self.mark() - with self._state as state: + src = self._src + src._marker = src._idx + with src._state as state: while True: - c = self._current + c = src._current if c == "\n": # Found a newline; Return all whitespace found up to this point. - self.inc() + src.inc() - return None, Whitespace(self.extract()) + return None, Whitespace(src.extract()) elif c in " \t\r": if c == "\r": - with self._state(restore=True): - if not self.inc() or self._current != "\n": - raise self.parse_error( + with src._state(restore=True): + if not src.inc() or src._current != "\n": + raise src.parse_error( InvalidControlChar, CTRL_M, "documents" ) # Skip whitespace. - if not self.inc(): - return None, Whitespace(self.extract()) + if not src.inc(): + return None, Whitespace(src.extract()) elif c == "#": # Found a comment, parse it - indent = self.extract() + indent = src.extract() cws, comment, trail = self._parse_comment_trail() return None, Comment(Trivia(indent, cws, comment, trail)) @@ -261,98 +263,100 @@ def _parse_comment_trail(self, parse_trail: bool = True) -> tuple[str, str, str] If there is no comment, comment_ws and comment will simply be empty. """ - if self.end(): + src = self._src + if src._idx >= src._length: return "", "", "" comment = "" comment_ws = "" - self.mark() + src._marker = src._idx while True: - c = self._current + c = src._current if c == "\n": break elif c == "#": - comment_ws = self.extract() + comment_ws = src.extract() - self.mark() - self.inc() # Skip # + src._marker = src._idx + src.inc() # Skip # # The comment itself - while not self.end() and self._current not in NL: - code = ord(self._current) + while src._idx < src._length and src._current not in NL: + code = ord(src._current) if code == CHR_DEL or (code <= CTRL_CHAR_LIMIT and code != CTRL_I): - raise self.parse_error(InvalidControlChar, code, "comments") + raise src.parse_error(InvalidControlChar, code, "comments") - if not self.inc(): + if not src.inc(): break - comment = self.extract() - self.mark() + comment = src.extract() + src._marker = src._idx break elif c in " \t\r": if c == "\r": - with self._state(restore=True): - if not self.inc() or self._current != "\n": - raise self.parse_error( + with src._state(restore=True): + if not src.inc() or src._current != "\n": + raise src.parse_error( InvalidControlChar, CTRL_M, "comments" ) - self.inc() + src.inc() else: - raise self.parse_error(UnexpectedCharError, c) + raise src.parse_error(UnexpectedCharError, c) - if self.end(): + if src._idx >= src._length: break trail = "" if parse_trail: - while self._current in SPACES and self.inc(): + while src._current in SPACES and src.inc(): pass - if self._current == "\r": - with self._state(restore=True): - if not self.inc() or self._current != "\n": - raise self.parse_error(InvalidControlChar, CTRL_M, "documents") - self.inc() + if src._current == "\r": + with src._state(restore=True): + if not src.inc() or src._current != "\n": + raise src.parse_error(InvalidControlChar, CTRL_M, "documents") + src.inc() - if self._current == "\n": - self.inc() + if src._current == "\n": + src.inc() - if self._idx != self._marker or self._current in WS: - trail = self.extract() + if src._idx != src._marker or src._current in WS: + trail = src.extract() return comment_ws, comment, trail def _parse_key_value(self, parse_comment: bool = False) -> tuple[Key, Item]: # Leading indent - self.mark() + src = self._src + src._marker = src._idx - while self._current in SPACES and self.inc(): + while src._current in SPACES and src.inc(): pass - indent = self.extract() + indent = src.extract() # Key key = self._parse_key() - self.mark() + src._marker = src._idx - found_equals = self._current == "=" - while self._current in KV and self.inc(): - if self._current == "=": + found_equals = src._current == "=" + while src._current in KV and src.inc(): + if src._current == "=": if found_equals: - raise self.parse_error(UnexpectedCharError, "=") + raise src.parse_error(UnexpectedCharError, "=") else: found_equals = True if not found_equals: - raise self.parse_error(UnexpectedCharError, self._current) + raise src.parse_error(UnexpectedCharError, src._current) if not key.sep: - key.sep = self.extract() + key.sep = src.extract() else: - key.sep += self.extract() + key.sep += src.extract() # Value val = self._parse_value() @@ -377,10 +381,11 @@ def _parse_key(self) -> Key: Parses a Key at the current position; WS before the key must be exhausted first at the callsite. """ - self.mark() - while self._current in SPACES and self.inc(): + src = self._src + src._marker = src._idx + while src._current in SPACES and src.inc(): pass - if self._current in "\"'": + if src._current in "\"'": return self._parse_quoted_key() else: return self._parse_bare_key() @@ -389,9 +394,10 @@ def _parse_quoted_key(self) -> Key: """ Parses a key enclosed in either single or double quotes. """ + src = self._src # Extract the leading whitespace - original = self.extract() - quote_style = self._current + original = src.extract() + quote_style = src._current key_type = next((t for t in KeyType if t.value == quote_style), None) if key_type is None: @@ -401,15 +407,15 @@ def _parse_quoted_key(self) -> Key: StringType.SLB if key_type == KeyType.Basic else StringType.SLL ) if key_str._t.is_multiline(): - raise self.parse_error(UnexpectedCharError, key_str._t.value) + raise src.parse_error(UnexpectedCharError, key_str._t.value) original += key_str.as_string() - self.mark() - while self._current in SPACES and self.inc(): + src._marker = src._idx + while src._current in SPACES and src.inc(): pass - original += self.extract() + original += src.extract() result: Key = SingleKey(str(key_str), t=key_type, sep="", original=original) - if self._current == ".": - self.inc() + if src._current == ".": + src.inc() result = result.concat(self._parse_key()) return result @@ -418,23 +424,24 @@ def _parse_bare_key(self) -> Key: """ Parses a bare key. """ - while (self._current in BARE or self._current in SPACES) and self.inc(): + src = self._src + while (src._current in BARE or src._current in SPACES) and src.inc(): pass - original = self.extract() + original = src.extract() key_s = original.strip() if not key_s: # Empty key - raise self.parse_error(EmptyKeyError) + raise src.parse_error(EmptyKeyError) if " " in key_s: # Bare key with spaces in it - raise self.parse_error(ParseError, f'Invalid key "{key_s}"') + raise src.parse_error(ParseError, f'Invalid key "{key_s}"') result: Key = SingleKey(key_s, KeyType.Bare, "", original) - if self._current == ".": - self.inc() + if src._current == ".": + src.inc() result = result.concat(self._parse_key()) return result @@ -443,8 +450,9 @@ def _parse_value(self) -> Item: """ Attempts to parse a value at the current position. """ - self.mark() - c = self._current + src = self._src + src._marker = src._idx + c = src._current trivia = Trivia() if c == StringType.SLB.value: @@ -468,22 +476,22 @@ def _parse_value(self) -> Item: "nan", }: # Number - while self._current not in " \t\n\r#,]}" and self.inc(): + while src._current not in " \t\n\r#,]}" and src.inc(): pass - raw = self.extract() + raw = src.extract() item = self._parse_number(raw, trivia) if item is not None: return item - raise self.parse_error(InvalidNumberError) + raise src.parse_error(InvalidNumberError) elif c in string.digits: # Integer, Float, Date, Time or DateTime - while self._current not in " \t\n\r#,]}" and self.inc(): + while src._current not in " \t\n\r#,]}" and src.inc(): pass - raw = self.extract() + raw = src.extract() m = RFC_3339_LOOSE.match(raw) if m: @@ -505,18 +513,18 @@ def _parse_value(self) -> Item: raw, ) except ValueError: - raise self.parse_error(InvalidDateTimeError) from None + raise src.parse_error(InvalidDateTimeError) from None if m.group("date"): try: dt = parse_rfc3339(raw) assert isinstance(dt, datetime.date) date = Date(dt.year, dt.month, dt.day, trivia, raw) - self.mark() - while self._current not in "\t\n\r#,]}" and self.inc(): + src._marker = src._idx + while src._current not in "\t\n\r#,]}" and src.inc(): pass - time_raw = self.extract() + time_raw = src.extract() time_part = time_raw.rstrip() trivia.comment_ws = time_raw[len(time_part) :] if not time_part: @@ -537,7 +545,7 @@ def _parse_value(self) -> Item: raw + time_part, ) except ValueError: - raise self.parse_error(InvalidDateError) from None + raise src.parse_error(InvalidDateError) from None if m.group("time"): try: @@ -553,15 +561,15 @@ def _parse_value(self) -> Item: raw, ) except ValueError: - raise self.parse_error(InvalidTimeError) from None + raise src.parse_error(InvalidTimeError) from None item = self._parse_number(raw, trivia) if item is not None: return item - raise self.parse_error(InvalidNumberError) + raise src.parse_error(InvalidNumberError) else: - raise self.parse_error(UnexpectedCharError, c) + raise src.parse_error(UnexpectedCharError, c) def _parse_true(self) -> Bool: return self._parse_bool(BoolType.TRUE) @@ -570,34 +578,36 @@ def _parse_false(self) -> Bool: return self._parse_bool(BoolType.FALSE) def _parse_bool(self, style: BoolType) -> Bool: - with self._state: + src = self._src + with src._state: style = BoolType(style) # only keep parsing for bool if the characters match the style # try consuming rest of chars in style for c in style: - self.consume(c, min=1, max=1) + src.consume(c, min=1, max=1) return Bool(style, Trivia()) def _parse_array(self) -> Array: + src = self._src # Consume opening bracket, EOF here is an issue (middle of array) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) elems: list[Item] = [] prev_value = None while True: # consume whitespace - mark = self._idx - self.consume(SPACES + NL) - indent = self._src[mark : self._idx] + mark = src._idx + src.consume(SPACES + NL) + indent = src[mark : src._idx] newline = set(NL) & set(indent) if newline: elems.append(Whitespace(indent)) continue # consume comment - if self._current == "#": + if src._current == "#": cws, comment, trail = self._parse_comment_trail(parse_trail=False) elems.append(Comment(Trivia(indent, cws, comment, trail))) continue @@ -608,9 +618,9 @@ def _parse_array(self) -> Array: continue # consume closing bracket - if self._current == "]": + if src._current == "]": # consume closing bracket, EOF here doesn't matter - self.inc() + src.inc() break # consume value @@ -623,8 +633,8 @@ def _parse_array(self) -> Array: pass # consume comma - if prev_value and self._current == ",": - self.inc(exception=UnexpectedEofError) + if prev_value and src._current == ",": + src.inc(exception=UnexpectedEofError) # If the previous item is Whitespace, add to it if isinstance(elems[-1], Whitespace): elems[-1]._s = elems[-1].s + "," @@ -633,7 +643,7 @@ def _parse_array(self) -> Array: prev_value = False continue - raise self.parse_error(UnexpectedCharError, self._current) + raise src.parse_error(UnexpectedCharError, src._current) try: res = Array(elems, Trivia()) @@ -642,48 +652,49 @@ def _parse_array(self) -> Array: else: return res - raise self.parse_error(ParseError, "Failed to parse array") + raise src.parse_error(ParseError, "Failed to parse array") def _parse_inline_table(self) -> InlineTable: + src = self._src # consume opening bracket, EOF here is an issue (middle of array) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) elems = Container(True) expect_key = True while True: while True: # consume whitespace and newlines - mark = self._idx - self.consume(SPACES + NL) - raw = self._src[mark : self._idx] + mark = src._idx + src.consume(SPACES + NL) + raw = src[mark : src._idx] if raw: elems.add(Whitespace(raw)) - if self._current != "#": + if src._current != "#": break cws, comment, trail = self._parse_comment_trail(parse_trail=False) elems.add(Comment(Trivia("", cws, comment, trail))) - if self._current == "}": + if src._current == "}": # consume closing bracket, EOF here doesn't matter - self.inc() + src.inc() break if expect_key: - if self._current == ",": - raise self.parse_error(UnexpectedCharError, self._current) + if src._current == ",": + raise src.parse_error(UnexpectedCharError, src._current) key, val = self._parse_key_value(False) elems.add(key, val) expect_key = False continue - if self._current != ",": - raise self.parse_error(UnexpectedCharError, self._current) + if src._current != ",": + raise src.parse_error(UnexpectedCharError, src._current) elems.add(Whitespace(",")) # consume comma, EOF here is an issue (middle of inline table) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) expect_key = True return InlineTable(elems, Trivia()) @@ -744,7 +755,8 @@ def _parse_basic_string(self) -> String: return self._parse_string(StringType.SLB) def _parse_escaped_char(self, multiline: bool) -> str: - if multiline and self._current in WS: + src = self._src + if multiline and src._current in WS: # When the last non-whitespace character on a line is # a \, it will be trimmed along with all whitespace # (including newlines) up to the next non-whitespace @@ -753,77 +765,78 @@ def _parse_escaped_char(self, multiline: bool) -> str: # hello \ # world""" tmp = "" - while self._current in WS: - tmp += self._current + while src._current in WS: + tmp += src._current # consume the whitespace, EOF here is an issue # (middle of string) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) continue # the escape followed by whitespace must have a newline # before any other chars if "\n" not in tmp: - raise self.parse_error(InvalidCharInStringError, self._current) + raise src.parse_error(InvalidCharInStringError, src._current) return "" - if self._current in _escaped: - c = _escaped[self._current] + if src._current in _escaped: + c = _escaped[src._current] # consume this char, EOF here is an issue (middle of string) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) return c - if self._current in {"u", "U"}: + if src._current in {"u", "U"}: # this needs to be a unicode - u, ue = self._peek_unicode(self._current == "U") + u, ue = self._peek_unicode(src._current == "U") if u is not None: assert ue is not None # consume the U char and the unicode value - self.inc_n(len(ue) + 1) + src.inc_n(len(ue) + 1) return u - raise self.parse_error(InvalidUnicodeValueError) + raise src.parse_error(InvalidUnicodeValueError) - if self._current == "x": + if src._current == "x": h, he = self._peek_hex() if h is not None: assert he is not None # consume the x char and the hex value - self.inc_n(len(he) + 1) + src.inc_n(len(he) + 1) return h - raise self.parse_error(InvalidUnicodeValueError) + raise src.parse_error(InvalidUnicodeValueError) - raise self.parse_error(InvalidCharInStringError, self._current) + raise src.parse_error(InvalidCharInStringError, src._current) def _parse_string(self, delim: StringType) -> String: + src = self._src # only keep parsing for string if the current character matches the delim - if self._current != delim.unit: - raise self.parse_error( + if src._current != delim.unit: + raise src.parse_error( InternalParserError, f"Invalid character for string type {delim}", ) # consume the opening/first delim, EOF here is an issue # (middle of string or middle of delim) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) - if self._current == delim.unit: + if src._current == delim.unit: # consume the closing/second delim, we do not care if EOF occurs as # that would simply imply an empty single line string - if not self.inc() or self._current != delim.unit: + if not src.inc() or src._current != delim.unit: # Empty string return String(delim, "", "", Trivia()) # consume the third delim, EOF here is an issue (middle of string) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) delim = delim.toggle() # convert delim to multi delim - self.mark() # to extract the original string with whitespace and all + src._marker = src._idx # to extract the original string with whitespace and all value = "" # Pre-compute delim properties — these are constant through the loop @@ -834,20 +847,20 @@ def _parse_string(self, delim: StringType) -> String: # A newline immediately following the opening delimiter will be trimmed. if delim_is_multiline: - if self._current == "\n": + if src._current == "\n": # consume the newline, EOF here is an issue (middle of string) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) else: - cur: str = self._current - with self._state(restore=True): - if self.inc(): - cur += self._current + cur: str = src._current + with src._state(restore=True): + if src.inc(): + cur += src._current if cur == "\r\n": - self.inc_n(2, exception=UnexpectedEofError) + src.inc_n(2, exception=UnexpectedEofError) escaped = False # whether the previous key was ESCAPE while True: - code = ord(self._current) + code = ord(src._current) if ( delim_is_singleline and not escaped @@ -862,22 +875,22 @@ def _parse_string(self, delim: StringType) -> String: ) ) ): - raise self.parse_error(InvalidControlChar, code, "strings") - elif delim_is_multiline and not escaped and self._current == "\r": - with self._state(restore=True): - if not self.inc() or self._current != "\n": - raise self.parse_error(InvalidControlChar, CTRL_M, "strings") - elif not escaped and self._current == delim_unit: + raise src.parse_error(InvalidControlChar, code, "strings") + elif delim_is_multiline and not escaped and src._current == "\r": + with src._state(restore=True): + if not src.inc() or src._current != "\n": + raise src.parse_error(InvalidControlChar, CTRL_M, "strings") + elif not escaped and src._current == delim_unit: # try to process current as a closing delim - original = self.extract() + original = src.extract() close = "" if delim_is_multiline: # Consume the delimiters to see if we are at the end of the string close = "" - while self._current == delim_unit: - close += self._current - self.inc() + while src._current == delim_unit: + close += src._current + src.inc() if len(close) < 3: # Not a triple quote, leave in result as-is. @@ -890,7 +903,7 @@ def _parse_string(self, delim: StringType) -> String: return String(delim, value, original, Trivia()) if len(close) >= 6: - raise self.parse_error(InvalidCharInStringError, self._current) + raise src.parse_error(InvalidCharInStringError, src._current) value += close[:-3] original += close[:-3] @@ -899,7 +912,7 @@ def _parse_string(self, delim: StringType) -> String: else: # consume the closing delim, we do not care if EOF occurs as # that would simply imply the end of self._src - self.inc() + src.inc() return String(delim, value, original, Trivia()) elif delim_is_basic and escaped: @@ -909,19 +922,19 @@ def _parse_string(self, delim: StringType) -> String: # no longer escaped escaped = False - elif delim_is_basic and self._current == "\\": + elif delim_is_basic and src._current == "\\": # the next char is being escaped escaped = True # consume this char, EOF here is an issue (middle of string) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) else: # this is either a literal string where we keep everything as is, # or this is not a special escaped char in a basic string - value += self._current + value += src._current # consume this char, EOF here is an issue (middle of string) - self.inc(exception=UnexpectedEofError) + src.inc(exception=UnexpectedEofError) def _parse_table_header(self) -> tuple[str, bool, Key]: """ diff --git a/tomlkit/toml_char.py b/tomlkit/toml_char.py index 2381afe..15613d6 100644 --- a/tomlkit/toml_char.py +++ b/tomlkit/toml_char.py @@ -1,5 +1,6 @@ import string + BARE = string.ascii_letters + string.digits + "-_" KV = "= \t" NUMBER = string.digits + "+-_.e"