Skip to content

Commit 713a26a

Browse files
authored
Add support for Unicode escape sequences in the \u{XXXXXX} format (#882)
1 parent 6ed72a2 commit 713a26a

File tree

3 files changed

+74
-0
lines changed

3 files changed

+74
-0
lines changed

parser/lexer/lexer.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,37 @@ func (l *Lexer) scanEscape(quote rune) rune {
211211
case 'x':
212212
ch = l.scanDigits(l.next(), 16, 2)
213213
case 'u':
214+
// Support variable-length form: \u{XXXXXX}
215+
if l.peek() == '{' {
216+
// consume '{'
217+
l.next()
218+
// read 1-6 hex digits
219+
digits := 0
220+
for {
221+
p := l.peek()
222+
if p == '}' {
223+
break
224+
}
225+
if digitVal(p) >= 16 {
226+
l.error("invalid char escape")
227+
return eof
228+
}
229+
if digits >= 6 {
230+
l.error("invalid char escape")
231+
return eof
232+
}
233+
l.next()
234+
digits++
235+
}
236+
if l.peek() != '}' || digits == 0 {
237+
l.error("invalid char escape")
238+
return eof
239+
}
240+
// consume '}' and continue
241+
l.next()
242+
ch = l.next()
243+
break
244+
}
214245
ch = l.scanDigits(l.next(), 16, 4)
215246
case 'U':
216247
ch = l.scanDigits(l.next(), 16, 8)

parser/lexer/lexer_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,14 @@ func TestLex(t *testing.T) {
291291
{Kind: EOF},
292292
},
293293
},
294+
{
295+
"\"\\u{61}\\u{1F600}\" '\\u{61}\\u{1F600}'",
296+
[]Token{
297+
{Kind: String, Value: "a😀"},
298+
{Kind: String, Value: "a😀"},
299+
{Kind: EOF},
300+
},
301+
},
294302
}
295303

296304
for _, test := range tests {

parser/lexer/utils.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,41 @@ func unescapeChar(s string) (value rune, multibyte bool, tail string, err error)
111111

112112
// 4. Unicode escape sequences, reproduced from `strconv/quote.go`
113113
case 'x', 'X', 'u', 'U':
114+
// Support Rust/JavaScript-style variable-length form: \u{XXXXXX}
115+
if c == 'u' && len(s) > 0 && s[0] == '{' {
116+
// consume '{'
117+
s = s[1:]
118+
var v rune
119+
digits := 0
120+
for len(s) > 0 && s[0] != '}' {
121+
x, ok := unhex(s[0])
122+
if !ok {
123+
err = fmt.Errorf("unable to unescape string")
124+
return
125+
}
126+
if digits >= 6 { // at most 6 hex digits
127+
err = fmt.Errorf("unable to unescape string")
128+
return
129+
}
130+
v = v<<4 | x
131+
s = s[1:]
132+
digits++
133+
}
134+
// require closing '}' and at least 1 digit
135+
if len(s) == 0 || s[0] != '}' || digits == 0 {
136+
err = fmt.Errorf("unable to unescape string")
137+
return
138+
}
139+
// consume '}'
140+
s = s[1:]
141+
if v > utf8.MaxRune {
142+
err = fmt.Errorf("unable to unescape string")
143+
return
144+
}
145+
value = v
146+
multibyte = true
147+
break
148+
}
114149
n := 0
115150
switch c {
116151
case 'x', 'X':

0 commit comments

Comments
 (0)