From ce66d207dc905efc677f264a70f65ea9a68e5780 Mon Sep 17 00:00:00 2001
From: Thomas Foutrein <thomas.foutrein@gmail.com>
Date: Fri, 5 Jun 2026 14:30:26 +0200
Subject: [PATCH] Speed up parsing by interning TOMLChar instances

The parser reads a document one character at a time, wrapping each in a
`TOMLChar`. Since TOML draws on a tiny alphabet, the same single-character
strings are reconstructed over and over.

Cache one `TOMLChar` per distinct character in a module-level dict and
return the cached instance, turning every repeat construction into a dict
lookup. The NUL character is deliberately left un-interned: `Source` uses
`TOMLChar("\0")` as its end-of-input sentinel and detects EOF by identity,
so caching it would make a real U+0000 in the input alias the sentinel and
be mistaken for end-of-file.

No behaviour change (full test suite incl. the toml-test conformance cases
passes); ~1.1x faster parsing across a range of document sizes/shapes.
---
 CHANGELOG.md         |  1 +
 tomlkit/toml_char.py | 26 ++++++++++++++++++++++----
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fa39e4f..a4b6852 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Changed
 
 - Speed up membership tests (`key in ...`) on `Container`, `Table` and `InlineTable` with native `__contains__` implementations, avoiding the inherited `MutableMapping` round-trip through `__getitem__` (which resolves the value and builds an exception on every absent key). ([#483](https://github.com/python-poetry/tomlkit/issues/483))
+- Speed up parsing by interning `TOMLChar` instances: the single-character strings read while parsing draw on a tiny alphabet, so caching one instance per character avoids reconstructing them on every read. ([#488](https://github.com/python-poetry/tomlkit/pull/488))
 
 ### Fixed
 
diff --git a/tomlkit/toml_char.py b/tomlkit/toml_char.py
index 970cbd4..6936098 100644
--- a/tomlkit/toml_char.py
+++ b/tomlkit/toml_char.py
@@ -1,12 +1,30 @@
 import string
 
 
-class TOMLChar(str):
-    def __init__(self, c: str) -> None:
-        super().__init__()
+# Intern TOMLChar instances. A document is read one character at a time and
+# draws on a tiny alphabet, so the same single-character strings get built over
+# and over; caching one instance per character turns each repeat into a dict
+# lookup. Entries are never evicted: the cache holds one per distinct character
+# seen so far (a few hundred at most for typical input).
+_TOML_CHAR_CACHE: dict[str, "TOMLChar"] = {}
+
 
-        if len(self) > 1:
+class TOMLChar(str):
+    def __new__(cls, c: str) -> "TOMLChar":
+        """Return a TOMLChar for ``c``, reusing the cached instance when one exists."""
+        cached = _TOML_CHAR_CACHE.get(c)
+        if cached is not None:
+            return cached
+        instance = super().__new__(cls, c)
+        # Check the built string, not ``c``: str(b"a") is "b'a'" though len(b"a") is 1.
+        if len(instance) > 1:
             raise ValueError("A TOML character must be of length 1")
+        # Never intern the NUL character: Source uses TOMLChar("\0") as its
+        # end-of-input sentinel and detects EOF by identity (`current is EOF`),
+        # so a cached "\0" would make a real U+0000 in the input look like EOF.
+        if instance == "\0":
+            return instance
+        return _TOML_CHAR_CACHE.setdefault(c, instance)
 
     BARE = string.ascii_letters + string.digits + "-_"
     KV = "= \t"