diff --git a/CHANGELOG.md b/CHANGELOG.md
index fa39e4f..a4b6852 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Changed
 
 - Speed up membership tests (`key in ...`) on `Container`, `Table` and `InlineTable` with native `__contains__` implementations, avoiding the inherited `MutableMapping` round-trip through `__getitem__` (which resolves the value and builds an exception on every absent key). ([#483](https://github.com/python-poetry/tomlkit/issues/483))
+- Speed up parsing by interning `TOMLChar` instances: the single-character strings read while parsing draw on a tiny alphabet, so caching one instance per character avoids reconstructing them on every read. ([#488](https://github.com/python-poetry/tomlkit/pull/488))
 
 ### Fixed
 
diff --git a/tomlkit/toml_char.py b/tomlkit/toml_char.py
index 970cbd4..6936098 100644
--- a/tomlkit/toml_char.py
+++ b/tomlkit/toml_char.py
@@ -1,12 +1,40 @@
 import string
 
 
-class TOMLChar(str):
-    def __init__(self, c: str) -> None:
-        super().__init__()
+# Intern TOMLChar instances. A document is read one character at a time and
+# draws on a tiny alphabet, so the same single-character strings get built over
+# and over; caching one instance per character turns each repeat into a dict
+# lookup. Input is not limited to that alphabet, though (strings and comments
+# may hold any Unicode character), so the cache stops admitting new entries at
+# _TOML_CHAR_CACHE_MAX to keep its memory use bounded.
+_TOML_CHAR_CACHE: dict[str, "TOMLChar"] = {}
+_TOML_CHAR_CACHE_MAX = 1024
+
 
-        if len(self) > 1:
+class TOMLChar(str):
+    def __new__(cls, c: str) -> "TOMLChar":
+        """Return the ``TOMLChar`` for *c*, reusing the interned instance if any.
+
+        Raises ``ValueError`` if *c* is longer than one character.
+        """
+        # Only TOMLChar itself is interned: a subclass must get an instance of
+        # its own type and must not plant one in the shared cache.
+        use_cache = cls is TOMLChar
+        if use_cache:
+            cached = _TOML_CHAR_CACHE.get(c)
+            if cached is not None:
+                return cached
+        if len(c) > 1:
             raise ValueError("A TOML character must be of length 1")
+        instance = super().__new__(cls, c)
+        # Never intern the NUL character: Source uses TOMLChar("\0") as its
+        # end-of-input sentinel and detects EOF by identity (`current is EOF`).
+        # Caching "\0" would make a real U+0000 in the input share that identity,
+        # so the parser would treat an embedded NUL as end-of-file instead of
+        # rejecting it. Leaving it un-interned keeps EOF a unique sentinel.
+        if use_cache and c != "\0" and len(_TOML_CHAR_CACHE) < _TOML_CHAR_CACHE_MAX:
+            _TOML_CHAR_CACHE[c] = instance
+        return instance
 
     BARE = string.ascii_letters + string.digits + "-_"
     KV = "= \t"