1010import java .io .*;
1111
1212 /**
13- * A general-purpose character input for reading Unicode text from byte streams
14- * and text strings. It supports UTF-8 by default, but can be configured
15- * to support UTF-16 and UTF-32 as well.
13+ * A general-purpose character input for reading text from byte streams and
14+ * text strings. When reading byte streams, this class supports the
15+ * UTF-8 character encoding by default, but can be configured to support
16+ * UTF-16 and UTF-32 as well.
1617 */
1718 final class CharacterReader implements ICharacterInput {
1819 private final int mode ;
@@ -47,10 +48,21 @@ public CharacterReader (String str, boolean skipByteOrderMark, boolean
4748 this .stream = null ;
4849 }
4950
51+ /**
52+ * Initializes a new instance of the CharacterReader class, reading UTF-8 text
53+ * that can start with an optional byte order mark (U+FEFF), and where
54+ * invalid UTF-8 is replaced with replacement characters.
55+ */
5056 public CharacterReader (InputStream stream ) {
5157 this (stream , 0 , false );
5258 }
5359
60+ /**
61+ * Initializes a new instance of the CharacterReader class.
62+ * @param stream A readable byte stream. If the stream is detected as UTF-8,
63+ * the reader skips the first code point if that code point is a byte order
64+ * mark (U+FEFF).
65+ */
5466 public CharacterReader (InputStream stream , int mode , boolean errorThrow ) {
5567 this (stream , mode , errorThrow , false );
5668 }
@@ -59,12 +71,7 @@ public CharacterReader (InputStream stream, int mode) {
5971 }
6072
6173 /**
62- * Initializes a new instance of the CharacterReader class.
63- * @param stream Not documented yet.
64- * @param mode Not documented yet.
65- * @param errorThrow Not documented yet. (3).
66- * @param dontSkipUtf8Bom Not documented yet. (4).
67- * @throws NullPointerException The parameter {@code stream} is null.
74+ * Initializes a new instance of the CharacterReader class.
6875 */
6976 public CharacterReader (InputStream stream , int mode , boolean errorThrow ,
7077 boolean dontSkipUtf8Bom ) {
@@ -183,18 +190,15 @@ private int DetectUtf8Or16Or32(int c1) {
183190 return 0xfffd ;
184191 }
185192 } else if (c1 == 0 && mode == 4 ) {
186- // Here, the relevant cases are:
187- // 0 0 0 NZA --> UTF-32BE (if mode is 4)
193+ // Here, the relevant case is:
188194 // 0 0 FE FF --> UTF-32BE
189195 // Anything else is treated as UTF-8
190196 c2 = this .stream .read ();
191197 c3 = this .stream .read ();
192198 c4 = this .stream .read ();
193- if (c2 == 0 &&
194- ((c3 == 0xfe && c4 == 0xff ) ||
195- (c3 == 0 && c4 >= 0x01 && c4 <= 0x7f ))) {
199+ if (c2 == 0 && c3 == 0xfe && c4 == 0xff ) {
196200 this .reader = new Utf32Reader (this .stream , true , errorThrow );
197- return c3 == 0 ? c4 : this .reader .ReadChar ();
201+ return this .reader .ReadChar ();
198202 } else {
199203 Utf8Reader utf8reader = new Utf8Reader (this .stream , errorThrow );
200204 utf8reader .UngetThree (c2 , c3 , c4 );
@@ -334,7 +338,7 @@ private int DetectUnicodeEncoding() {
334338 utf8reader = new Utf8Reader (this .stream , errorThrow );
335339 this .reader = utf8reader ;
336340 c1 = utf8reader .ReadChar ();
337- if (c1 == 0xfeff ) {
341+ if (c1 == 0xfeff && ! dontSkipUtf8Bom ) {
338342 // Skip BOM
339343 c1 = utf8reader .ReadChar ();
340344 }
0 commit comments