Skip to content

Commit 9d72e92

Browse files
sideshowbarker authored and hsivonen committed
Conform tokenizer-only U+0000 NUL handling to spec
This change brings the tokenizer’s handling of U+0000 NUL characters in the DATA state and the CDATA section state into conformance with the requirements in the HTML spec — for the case where only tokenization is being performed, without tree construction; that is, the case where the tokenizer() method is called, rather than parse() or parseFragment().

Specifically, the tokenization steps defined in the spec require that when a U+0000 NUL is consumed in the DATA state or in the CDATA section state, the parser must then emit a U+0000 NUL. But when performing tree construction, the spec requires that when a U+0000 NUL is consumed, the parser must instead emit a U+FFFD REPLACEMENT CHARACTER.

Without this change, the parser always emits a U+FFFD REPLACEMENT CHARACTER — even when only tokenization is being performed. That causes us to fail a number of tests in the html5lib-tests suite.

For more background on the relevant behavior, see the following:

* https://www.w3.org/Bugs/Public/show_bug.cgi?id=9659
* whatwg/html@d98f83e
* 9b9c263

Relates to #35
1 parent 8bcc5bc commit 9d72e92

File tree

5 files changed

+39
-2
lines changed

5 files changed

+39
-2
lines changed

src/nu/validator/htmlparser/common/TokenHandler.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,17 @@ public void characters(@Const @NoLength char[] buf, int start, int length)
144144
*/
145145
public void zeroOriginatingReplacementCharacter() throws SAXException;
146146

147+
/**
148+
* Emits:
149+
*
150+
* * U+0000 if only tokenization is being performed
151+
* * U+FFFD if tree construction is being performed also
152+
*
153+
* @throws SAXException
154+
* if something went wrong
155+
*/
156+
public void zeroOrReplacementCharacter() throws SAXException;
157+
147158
/**
148159
* The end-of-file token.
149160
*

src/nu/validator/htmlparser/impl/Tokenizer.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1578,7 +1578,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
15781578
break dataloop; // FALL THROUGH continue
15791579
// stateloop;
15801580
case '\u0000':
1581-
emitReplacementCharacter(buf, pos);
1581+
maybeEmitReplacementCharacter(buf, pos);
15821582
continue;
15831583
case '\r':
15841584
emitCarriageReturn(buf, pos);
@@ -3081,7 +3081,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
30813081
state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos);
30823082
break cdatasectionloop; // FALL THROUGH
30833083
case '\u0000':
3084-
emitReplacementCharacter(buf, pos);
3084+
maybeEmitReplacementCharacter(buf, pos);
30853085
continue;
30863086
case '\r':
30873087
emitCarriageReturn(buf, pos);
@@ -6177,6 +6177,13 @@ private void emitReplacementCharacter(@NoLength char[] buf, int pos)
61776177
cstart = pos + 1;
61786178
}
61796179

6180+
private void maybeEmitReplacementCharacter(@NoLength char[] buf, int pos)
6181+
throws SAXException {
6182+
flushChars(buf, pos);
6183+
tokenHandler.zeroOrReplacementCharacter();
6184+
cstart = pos + 1;
6185+
}
6186+
61806187
private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
61816188
throws SAXException {
61826189
flushChars(buf, pos);

src/nu/validator/htmlparser/impl/TreeBuilder.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,13 @@ public void zeroOriginatingReplacementCharacter() throws SAXException {
12411241
}
12421242
}
12431243

1244+
/**
1245+
* @see nu.validator.htmlparser.common.TokenHandler#zeroOrReplacementCharacter()
1246+
*/
1247+
public void zeroOrReplacementCharacter() throws SAXException {
1248+
zeroOriginatingReplacementCharacter();
1249+
}
1250+
12441251
public final void eof() throws SAXException {
12451252
flushCharacters();
12461253
// Note: Can't attach error messages to EOF in C++ yet

test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ public class JSONArrayTokenHandler implements TokenHandler, ErrorHandler {
5454

5555
private static final char[] REPLACEMENT_CHARACTER = { '\uFFFD' };
5656

57+
private static final char[] NULL = { '\u0000' };
58+
5759
private final StringBuilder builder = new StringBuilder();
5860

5961
private JSONArray array = null;
@@ -174,6 +176,11 @@ public void endTokenization() throws SAXException {
174176
builder.append(REPLACEMENT_CHARACTER, 0, 1);
175177
}
176178

179+
@Override public void zeroOrReplacementCharacter()
180+
throws SAXException {
181+
builder.append(NULL, 0, 1);
182+
}
183+
177184
@Override public boolean cdataSectionAllowed() throws SAXException {
178185
return false;
179186
}

test-src/nu/validator/htmlparser/test/TokenPrinter.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,11 @@ public void endTokenization() throws SAXException {
200200
}
201201
}
202202

203+
@Override public void zeroOrReplacementCharacter()
204+
throws SAXException {
205+
zeroOriginatingReplacementCharacter();
206+
}
207+
203208
@Override public boolean cdataSectionAllowed() throws SAXException {
204209
return false;
205210
}

0 commit comments

Comments
 (0)