diff --git a/parser/lexer/lexer.go b/parser/lexer/lexer.go index 5daf9503..0e75942d 100644 --- a/parser/lexer/lexer.go +++ b/parser/lexer/lexer.go @@ -211,6 +211,37 @@ func (l *Lexer) scanEscape(quote rune) rune { case 'x': ch = l.scanDigits(l.next(), 16, 2) case 'u': + // Support variable-length form: \u{XXXXXX} + if l.peek() == '{' { + // consume '{' + l.next() + // read 1-6 hex digits + digits := 0 + for { + p := l.peek() + if p == '}' { + break + } + if digitVal(p) >= 16 { + l.error("invalid char escape") + return eof + } + if digits >= 6 { + l.error("invalid char escape") + return eof + } + l.next() + digits++ + } + if l.peek() != '}' || digits == 0 { + l.error("invalid char escape") + return eof + } + // consume '}' and continue + l.next() + ch = l.next() + break + } ch = l.scanDigits(l.next(), 16, 4) case 'U': ch = l.scanDigits(l.next(), 16, 8) diff --git a/parser/lexer/lexer_test.go b/parser/lexer/lexer_test.go index baa5aabb..f46870e9 100644 --- a/parser/lexer/lexer_test.go +++ b/parser/lexer/lexer_test.go @@ -291,6 +291,14 @@ func TestLex(t *testing.T) { {Kind: EOF}, }, }, + { + "\"\\u{61}\\u{1F600}\" '\\u{61}\\u{1F600}'", + []Token{ + {Kind: String, Value: "a😀"}, + {Kind: String, Value: "a😀"}, + {Kind: EOF}, + }, + }, } for _, test := range tests { diff --git a/parser/lexer/utils.go b/parser/lexer/utils.go index fdb8beaa..6aa088ae 100644 --- a/parser/lexer/utils.go +++ b/parser/lexer/utils.go @@ -111,6 +111,41 @@ func unescapeChar(s string) (value rune, multibyte bool, tail string, err error) // 4. 
Unicode escape sequences, reproduced from `strconv/quote.go` case 'x', 'X', 'u', 'U': + // Support Rust/ECMAScript-style variable-length form: \u{XXXXXX} + if c == 'u' && len(s) > 0 && s[0] == '{' { + // consume '{' + s = s[1:] + var v rune + digits := 0 + for len(s) > 0 && s[0] != '}' { + x, ok := unhex(s[0]) + if !ok { + err = fmt.Errorf("unable to unescape string") + return + } + if digits >= 6 { // at most 6 hex digits + err = fmt.Errorf("unable to unescape string") + return + } + v = v<<4 | x + s = s[1:] + digits++ + } + // require closing '}' and at least 1 digit + if len(s) == 0 || s[0] != '}' || digits == 0 { + err = fmt.Errorf("unable to unescape string") + return + } + // consume '}' + s = s[1:] + if v > utf8.MaxRune { + err = fmt.Errorf("unable to unescape string") + return + } + value = v + multibyte = true + break + } n := 0 switch c { case 'x', 'X':