@@ -139,13 +139,31 @@ macro_rules! configure_methods {
139139macro_rules! read_event_impl {
140140 (
141141 $self: ident, $buf: ident,
142+ $reader: expr,
142143 $read_until_open: ident,
143144 $read_until_close: ident
144145 $( , $await: ident) ?
145146 ) => { {
146147 let event = match $self. parser. state {
147- ParseState :: Init => $self. $read_until_open( $buf, true ) $( . $await) ?,
148- ParseState :: ClosedTag => $self. $read_until_open( $buf, false ) $( . $await) ?,
148+ ParseState :: Init => {
149+ // If the encoding is set explicitly, we do not need to detect it. For example,
150+ // explicit UTF-8 is set automatically if the Reader was created using `from_str`.
151+ // But we still need to remove the BOM for consistency with the code path
152+ // where the `encoding` feature is disabled
153+ #[ cfg( feature = "encoding" ) ]
154+ if let Some ( encoding) = $reader. detect_encoding( ) $( . $await) ? ? {
155+ if $self. parser. encoding. can_be_refined( ) {
156+ $self. parser. encoding = crate :: reader:: EncodingRef :: BomDetected ( encoding) ;
157+ }
158+ }
159+
160+ // Removes UTF-8 BOM if it is present
161+ #[ cfg( not( feature = "encoding" ) ) ]
162+ $reader. remove_utf8_bom( ) $( . $await) ? ?;
163+
164+ $self. $read_until_open( $buf) $( . $await) ?
165+ } ,
166+ ParseState :: ClosedTag => $self. $read_until_open( $buf) $( . $await) ?,
149167 ParseState :: OpenedTag => $self. $read_until_close( $buf) $( . $await) ?,
150168 ParseState :: Empty => $self. parser. close_expanded_empty( ) ,
151169 ParseState :: Exit => return Ok ( Event :: Eof ) ,
@@ -160,7 +178,7 @@ macro_rules! read_event_impl {
160178
161179macro_rules! read_until_open {
162180 (
163- $self: ident, $buf: ident, $first : ident ,
181+ $self: ident, $buf: ident,
164182 $reader: expr,
165183 $read_event: ident
166184 $( , $await: ident) ?
@@ -180,7 +198,7 @@ macro_rules! read_until_open {
180198 . read_bytes_until( b'<' , $buf, & mut $self. parser. offset)
181199 $( . $await) ?
182200 {
183- Ok ( Some ( bytes) ) => $self. parser. read_text( bytes, $first ) ,
201+ Ok ( Some ( bytes) ) => $self. parser. read_text( bytes) ,
184202 Ok ( None ) => Ok ( Event :: Eof ) ,
185203 Err ( e) => Err ( e) ,
186204 }
@@ -557,15 +575,15 @@ impl<R> Reader<R> {
557575 where
558576 R : XmlSource < ' i , B > ,
559577 {
560- read_event_impl ! ( self , buf, read_until_open, read_until_close)
578+ read_event_impl ! ( self , buf, self . reader , read_until_open, read_until_close)
561579 }
562580
563581 /// Reads input until `<` is found, moves the reader to an `OpenedTag` state and returns a `Text` event. The body is generated by the `read_until_open!` macro with `self.reader` as the source.
564- fn read_until_open < ' i , B > ( & mut self , buf : B , first : bool ) -> Result < Event < ' i > >
582+ fn read_until_open < ' i , B > ( & mut self , buf : B ) -> Result < Event < ' i > >
565583 where
566584 R : XmlSource < ' i , B > ,
567585 {
568- read_until_open ! ( self , buf, first , self . reader, read_event_impl)
586+ read_until_open ! ( self , buf, self . reader, read_event_impl)
569587 }
570588
571589 /// Private function to read until `>` is found. This function expects that
@@ -595,6 +613,14 @@ impl<R> Reader<R> {
595613/// - `B`: a type of a buffer that can be used to store data read from `Self` and
596614/// from which events can borrow
597615trait XmlSource < ' r , B > {
616+ /// Removes the UTF-8 BOM from the input if it is present. Declared only when the `encoding` feature is disabled.
617+ #[ cfg( not( feature = "encoding" ) ) ]
618+ fn remove_utf8_bom ( & mut self ) -> Result < ( ) > ;
619+
620+ /// Determines the encoding from the start of the input and removes the BOM if it is present. Declared only when the `encoding` feature is enabled.
621+ #[ cfg( feature = "encoding" ) ]
622+ fn detect_encoding ( & mut self ) -> Result < Option < & ' static Encoding > > ;
623+
598624 /// Read input until `byte` is found or end of input is reached.
599625 ///
600626 /// Returns a slice of data read up to `byte`; the `byte` itself is not included in the result.
@@ -1579,10 +1605,39 @@ mod test {
15791605 use crate :: reader:: Reader ;
15801606 use pretty_assertions:: assert_eq;
15811607
1608+ /// When the `encoding` feature is enabled, the encoding should be detected
1609+ /// from the BOM (UTF-8) and the BOM should be stripped.
1610+ ///
1611+ /// When the `encoding` feature is disabled, UTF-8 is assumed and the BOM
1612+ /// character should be stripped for consistency
1613+ #[ $test]
1614+ $( $async) ? fn bom_from_reader( ) {
1615+ let mut reader = Reader :: from_reader( "\u{feff} \u{feff} " . as_bytes( ) ) ;
1616+ // Only the leading BOM is expected to be consumed; what follows it is reported as text
1617+ assert_eq!(
1618+ reader. $read_event( $buf) $( . $await) ? . unwrap( ) ,
1619+ Event :: Text ( BytesText :: from_escaped( "\u{feff} " ) )
1620+ ) ;
1621+ // After that single text event the input must be exhausted
1622+ assert_eq!(
1623+ reader. $read_event( $buf) $( . $await) ? . unwrap( ) ,
1624+ Event :: Eof
1625+ ) ;
1626+ }
1627+
1628+ /// When parsing from `&str`, the encoding is fixed (UTF-8), so
1629+ /// - when the `encoding` feature is disabled, the behavior is the
1630+ /// same as in the `bom_from_reader` test
1631+ /// - when the `encoding` feature is enabled, the behavior should
1632+ /// stay consistent, so the first BOM character is stripped
15821633 #[ $test]
1583- #[ should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events.
1584- $( $async) ? fn bom_at_start( ) {
1585- let mut reader = Reader :: from_str( "\u{feff} " ) ;
1634+ $( $async) ? fn bom_from_str( ) {
1635+ let mut reader = Reader :: from_str( "\u{feff} \u{feff} " ) ;
1636+
1637+ assert_eq!(
1638+ reader. $read_event( $buf) $( . $await) ? . unwrap( ) ,
1639+ Event :: Text ( BytesText :: from_escaped( "\u{feff} " ) )
1640+ ) ;
15861641
15871642 assert_eq!(
15881643 reader. $read_event( $buf) $( . $await) ? . unwrap( ) ,
0 commit comments