From ce66d207dc905efc677f264a70f65ea9a68e5780 Mon Sep 17 00:00:00 2001 From: Thomas Foutrein Date: Fri, 5 Jun 2026 14:30:26 +0200 Subject: [PATCH] Speed up parsing by interning TOMLChar instances The parser reads a document one character at a time, wrapping each in a `TOMLChar`. Since TOML draws on a tiny alphabet, the same single-character strings are reconstructed over and over. Cache one `TOMLChar` per distinct character in a module-level dict and return the cached instance, turning every repeat construction into a dict lookup. The NUL character is deliberately left un-interned: `Source` uses `TOMLChar("\0")` as its end-of-input sentinel and detects EOF by identity, so caching it would make a real U+0000 in the input alias the sentinel and be mistaken for end-of-file. No behaviour change (full test suite incl. the toml-test conformance cases passes); ~1.1x faster parsing across a range of document sizes/shapes. --- CHANGELOG.md | 1 + tomlkit/toml_char.py | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa39e4f..a4b6852 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ### Changed - Speed up membership tests (`key in ...`) on `Container`, `Table` and `InlineTable` with native `__contains__` implementations, avoiding the inherited `MutableMapping` round-trip through `__getitem__` (which resolves the value and builds an exception on every absent key). ([#483](https://github.com/python-poetry/tomlkit/issues/483)) +- Speed up parsing by interning `TOMLChar` instances: the single-character strings read while parsing draw on a tiny alphabet, so caching one instance per character avoids reconstructing them on every read. ([#488](https://github.com/python-poetry/tomlkit/pull/488)) ### Fixed diff --git a/tomlkit/toml_char.py b/tomlkit/toml_char.py index 970cbd4..6936098 100644 --- a/tomlkit/toml_char.py +++ b/tomlkit/toml_char.py @@ -1,12 +1,30 @@ import string -class TOMLChar(str): - def __init__(self, c: str) -> None: - super().__init__() +# Intern TOMLChar instances. A document is read one character at a time and +# draws on a tiny alphabet, so the same single-character strings get built over +# and over; caching one instance per character turns each repeat into a dict +# lookup. The cache is bounded by the alphabet seen (a few hundred entries at +# most for typical input). +_TOML_CHAR_CACHE: dict[str, "TOMLChar"] = {} + - if len(self) > 1: +class TOMLChar(str): + def __new__(cls, c: str) -> "TOMLChar": + cached = _TOML_CHAR_CACHE.get(c) + if cached is not None: + return cached + if len(c) > 1: raise ValueError("A TOML character must be of length 1") + instance = super().__new__(cls, c) + # Never intern the NUL character: Source uses TOMLChar("\0") as its + # end-of-input sentinel and detects EOF by identity (`current is EOF`). + # Caching "\0" would make a real U+0000 in the input share that identity, + # so the parser would treat an embedded NUL as end-of-file instead of + # rejecting it. Leaving it un-interned keeps EOF a unique sentinel. + if c != "\0": + _TOML_CHAR_CACHE[c] = instance + return instance BARE = string.ascii_letters + string.digits + "-_" KV = "= \t"