1010import java .io .*;
1111
1212 /**
13- * A general-purpose character input for reading text from byte streams and
14- * text strings. When reading byte streams, this class supports the
15- * UTF-8 character encoding by default, but can be configured to support
16- * UTF-16 and UTF-32 as well.
13+ * A general-purpose character input for reading Unicode text from byte streams
14+ * and text strings. It supports UTF-8 by default, but can be configured
15+ * to support UTF-16 and UTF-32 as well.
1716 */
1817 final class CharacterReader implements ICharacterInput {
1918 private final int mode ;
@@ -48,21 +47,10 @@ public CharacterReader (String str, boolean skipByteOrderMark, boolean
4847 this .stream = null ;
4948 }
5049
51- /**
52- * Initializes a new instance of the CharacterReader class, reading UTF-8 text
53- * that can start with an optional byte order mark (U + FEFF), and where
54- * invalid UTF-8 is replaced with replacement characters.
55- */
/**
 * Initializes a new instance of the CharacterReader class that reads from
 * a byte stream, by delegating to the (stream, mode, errorThrow)
 * constructor with a mode of 0 and errorThrow set to false.
 * @param stream The byte stream to read from.
 */
5650 public CharacterReader (InputStream stream ) {
5751 this (stream , 0 , false );
5852 }
5953
60- /**
61- * Initializes a new instance of the CharacterReader class.
62- * @param stream A readable byte stream. If the stream is detected as UTF-8,
63- * will skip the first code point if that code point is a byte order
64- * mark (U + FEFF).
65- */
/**
 * Initializes a new instance of the CharacterReader class that reads from
 * a byte stream, by delegating to the four-argument constructor with its
 * last argument (dontSkipUtf8Bom) set to false.
 * @param stream The byte stream to read from.
 * @param mode Encoding-detection mode, passed through unchanged.
 * NOTE(review): the set of valid values is not visible here; confirm
 * against the detection methods.
 * @param errorThrow Passed through unchanged; it is later handed to the
 * underlying UTF readers. NOTE(review): presumably selects throwing on
 * invalid input instead of substituting U+FFFD; confirm.
 */
6654 public CharacterReader (InputStream stream , int mode , boolean errorThrow ) {
6755 this (stream , mode , errorThrow , false );
6856 }
@@ -71,7 +59,12 @@ public CharacterReader (InputStream stream, int mode) {
7159 }
7260
7361 /**
74- *
62+ * Initializes a new instance of the CharacterReader class.
63+ * @param stream Not documented yet.
64+ * @param mode Not documented yet.
65+ * @param errorThrow Not documented yet. (3).
66+ * @param dontSkipUtf8Bom Not documented yet. (4).
67+ * @throws NullPointerException The parameter {@code stream} is null.
7568 */
7669 public CharacterReader (InputStream stream , int mode , boolean errorThrow ,
7770 boolean dontSkipUtf8Bom ) {
@@ -190,15 +183,18 @@ private int DetectUtf8Or16Or32(int c1) {
190183 return 0xfffd ;
191184 }
192185 } else if (c1 == 0 && mode == 4 ) {
193- // Here, the relevant case is:
186+ // Here, the relevant cases are:
187+ // 0 0 0 NZA --> UTF-32BE (if mode is 4; NZA = nonzero ASCII byte, 0x01 to 0x7f)
194188 // 0 0 FE FF --> UTF-32BE
195189 // Anything else is treated as UTF-8
196190 c2 = this .stream .read ();
197191 c3 = this .stream .read ();
198192 c4 = this .stream .read ();
199- if (c2 == 0 && c3 == 0xfe && c4 == 0xff ) {
193+ if (c2 == 0 &&
194+ ((c3 == 0xfe && c4 == 0xff ) ||
195+ (c3 == 0 && c4 >= 0x01 && c4 <= 0x7f ))) {
200196 this .reader = new Utf32Reader (this .stream , true , errorThrow );
201- return this .reader .ReadChar ();
197+ return c3 == 0 ? c4 : this .reader .ReadChar ();
202198 } else {
203199 Utf8Reader utf8reader = new Utf8Reader (this .stream , errorThrow );
204200 utf8reader .UngetThree (c2 , c3 , c4 );
@@ -338,7 +334,7 @@ private int DetectUnicodeEncoding() {
338334 utf8reader = new Utf8Reader (this .stream , errorThrow );
339335 this .reader = utf8reader ;
340336 c1 = utf8reader .ReadChar ();
341- if (c1 == 0xfeff && ! dontSkipUtf8Bom ) {
337+ if (c1 == 0xfeff ) {
342338 // Skip BOM
343339 c1 = utf8reader .ReadChar ();
344340 }
0 commit comments