Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from tomlkit.exceptions import EmptyTableNameError
from tomlkit.exceptions import InternalParserError
from tomlkit.exceptions import InvalidUnicodeValueError
from tomlkit.exceptions import UnexpectedCharError
from tomlkit.items import StringType
from tomlkit.parser import Parser
Expand Down Expand Up @@ -63,3 +64,19 @@ def test_parse_multiline_literal_string_with_crlf() -> None:
content = "a = '''foo\r\nbar'''"
parser = Parser(content)
assert parser.parse() == {"a": "foo\r\nbar"}


@pytest.mark.parametrize(
    "content",
    [
        r'a = "\uD800"',
        r'a = "\uDFFF"',
        r'a = "\U0000D800"',
        r'a = "\U0000DFFF"',
        r'a = "\U0000DC00"',
    ],
)
def test_parser_rejects_surrogate_unicode_escapes(content: str) -> None:
    """Check that surrogate code points in unicode escapes are rejected.

    Every parametrized document assigns a basic string holding a single
    unicode escape whose value lies in the surrogate range (U+D800 to
    U+DFFF). Both the 4-digit form and the 8-digit form (padded with
    leading zeros) are covered. Parsing any of them is expected to raise
    ``InvalidUnicodeValueError``.
    """
    # Build the parser outside the ``raises`` block so that only the call
    # to ``parse()`` is allowed to produce the expected exception.
    toml_parser = Parser(content)

    with pytest.raises(InvalidUnicodeValueError):
        toml_parser.parse()
14 changes: 11 additions & 3 deletions tomlkit/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1165,11 +1165,19 @@ def _peek_unicode(self, is_long: bool) -> tuple[str | None, str | None]:
else:
extracted = self.extract()

if extracted[0].lower() == "d" and extracted[1].strip("01234567"):
return None, None
try:
codepoint = int(extracted, 16)
except ValueError:
return None, extracted

# Unicode scalar values exclude the surrogate range
# (U+D800 to U+DFFF). The 8-digit \U form reaches this range
# with leading zeros, so it must be checked on the value itself.
if 0xD800 <= codepoint <= 0xDFFF:
return None, extracted

try:
value = chr(int(extracted, 16))
value = chr(codepoint)
except (ValueError, OverflowError):
value = None

Expand Down
Loading