Skip to content

Commit 60dc37f

Browse files
Mingundralley
authored andcommitted
Correctly detect UTF-16 encoding even without BOM
Fixes 2 tests
1 parent 7f34520 commit 60dc37f

File tree

7 files changed

+149
-75
lines changed

7 files changed

+149
-75
lines changed

src/encoding.rs

Lines changed: 15 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -99,29 +99,14 @@ pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b
9999
.ok_or(Error::NonDecodable(None))
100100
}
101101

102-
#[cfg(feature = "encoding")]
103-
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
104-
if encoding == UTF_8 && bytes.starts_with(UTF8_BOM) {
105-
bytes.split_at(3)
106-
} else if encoding == UTF_16LE && bytes.starts_with(UTF16_LE_BOM) {
107-
bytes.split_at(2)
108-
} else if encoding == UTF_16BE && bytes.starts_with(UTF16_BE_BOM) {
109-
bytes.split_at(2)
110-
} else {
111-
(&[], bytes)
112-
}
113-
}
114-
115-
#[cfg(feature = "encoding")]
116-
pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
117-
let (_, bytes) = split_at_bom(bytes, encoding);
118-
bytes
119-
}
120-
121102
/// Automatic encoding detection of XML files based using the
122103
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
123104
///
124-
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
105+
/// If encoding is detected, `Some` is returned with an encoding and size of BOM
106+
/// in bytes, if detection was performed using BOM, or zero, if detection was
107+
/// performed without BOM.
108+
///
109+
/// IF encoding was not recognized, `None` is returned.
125110
///
126111
/// Because the [`encoding_rs`] crate supports only subset of those encodings, only
127112
/// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
@@ -131,25 +116,26 @@ pub(crate) fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'
131116
///
132117
/// | Bytes |Detected encoding
133118
/// |-------------|------------------------------------------
134-
/// |`FE FF ## ##`|UTF-16, big-endian
119+
/// | **BOM**
120+
/// |`FE_FF_##_##`|UTF-16, big-endian
135121
/// |`FF FE ## ##`|UTF-16, little-endian
136122
/// |`EF BB BF` |UTF-8
137-
/// |-------------|------------------------------------------
123+
/// | **No BOM**
138124
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
139125
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
140126
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
141127
#[cfg(feature = "encoding")]
142-
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
128+
pub fn detect_encoding(bytes: &[u8]) -> Option<(&'static Encoding, usize)> {
143129
match bytes {
144130
// with BOM
145-
_ if bytes.starts_with(UTF16_BE_BOM) => Some(UTF_16BE),
146-
_ if bytes.starts_with(UTF16_LE_BOM) => Some(UTF_16LE),
147-
_ if bytes.starts_with(UTF8_BOM) => Some(UTF_8),
131+
_ if bytes.starts_with(UTF16_BE_BOM) => Some((UTF_16BE, 2)),
132+
_ if bytes.starts_with(UTF16_LE_BOM) => Some((UTF_16LE, 2)),
133+
_ if bytes.starts_with(UTF8_BOM) => Some((UTF_8, 3)),
148134

149135
// without BOM
150-
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
151-
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
152-
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible
136+
_ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some((UTF_16BE, 0)), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
137+
_ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some((UTF_16LE, 0)), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
138+
_ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some((UTF_8, 0)), // Some ASCII compatible
153139

154140
_ => None,
155141
}

src/reader/async_tokio.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,13 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
8686
buf: &'b mut Vec<u8>,
8787
) -> Pin<Box<dyn Future<Output = Result<Event<'b>>> + 'reader>> {
8888
Box::pin(async move {
89-
read_event_impl!(self, buf, read_until_open_async, read_until_close_async, await)
89+
read_event_impl!(
90+
self, buf,
91+
TokioAdapter(&mut self.reader),
92+
read_until_open_async,
93+
read_until_close_async,
94+
await
95+
)
9096
})
9197
}
9298

@@ -148,15 +154,9 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
148154
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
149155
}
150156

151-
/// Read until '<' is found and moves reader to an `Opened` state.
152-
///
153-
/// Return a `StartText` event if `first` is `true` and a `Text` event otherwise
154-
async fn read_until_open_async<'b>(
155-
&mut self,
156-
buf: &'b mut Vec<u8>,
157-
first: bool,
158-
) -> Result<Event<'b>> {
159-
read_until_open!(self, buf, first, TokioAdapter(&mut self.reader), read_event_into_async, await)
157+
/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
158+
async fn read_until_open_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
159+
read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await)
160160
}
161161

162162
/// Private function to read until `>` is found. This function expects that

src/reader/buffered_reader.rs

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,40 @@ use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, Xml
1414

1515
macro_rules! impl_buffered_source {
1616
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
17+
#[cfg(not(feature = "encoding"))]
18+
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
19+
use crate::encoding::UTF8_BOM;
20+
21+
loop {
22+
break match self $(.$reader)? .fill_buf() $(.$await)? {
23+
Ok(n) => {
24+
if n.starts_with(UTF8_BOM) {
25+
self $(.$reader)? .consume(UTF8_BOM.len());
26+
}
27+
Ok(())
28+
},
29+
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
30+
Err(e) => Err(Error::Io(e)),
31+
};
32+
}
33+
}
34+
35+
#[cfg(feature = "encoding")]
36+
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
37+
loop {
38+
break match self $(.$reader)? .fill_buf() $(.$await)? {
39+
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
40+
self $(.$reader)? .consume(bom_len);
41+
Ok(Some(enc))
42+
} else {
43+
Ok(None)
44+
},
45+
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
46+
Err(e) => Err(Error::Io(e)),
47+
};
48+
}
49+
}
50+
1751
#[inline]
1852
$($async)? fn read_bytes_until $(<$lf>)? (
1953
&mut self,
@@ -396,16 +430,14 @@ mod test {
396430
use pretty_assertions::assert_eq;
397431

398432
/// Checks that encoding is detected by BOM and changed after XML declaration
433+
/// BOM indicates UTF-16LE, but XML - windows-1251
399434
#[test]
400435
fn bom_detected() {
401436
let mut reader =
402437
Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());
403438
let mut buf = Vec::new();
404439

405440
assert_eq!(reader.decoder().encoding(), UTF_8);
406-
reader.read_event_into(&mut buf).unwrap();
407-
assert_eq!(reader.decoder().encoding(), UTF_16LE);
408-
409441
reader.read_event_into(&mut buf).unwrap();
410442
assert_eq!(reader.decoder().encoding(), WINDOWS_1251);
411443

src/reader/mod.rs

Lines changed: 65 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,31 @@ macro_rules! configure_methods {
139139
macro_rules! read_event_impl {
140140
(
141141
$self:ident, $buf:ident,
142+
$reader:expr,
142143
$read_until_open:ident,
143144
$read_until_close:ident
144145
$(, $await:ident)?
145146
) => {{
146147
let event = match $self.parser.state {
147-
ParseState::Init => $self.$read_until_open($buf, true) $(.$await)?,
148-
ParseState::ClosedTag => $self.$read_until_open($buf, false) $(.$await)?,
148+
ParseState::Init => {
149+
// If encoding set explicitly, we not need to detect it. For example,
150+
// explicit UTF-8 set automatically if Reader was created using `from_str`.
151+
// But we still need to remove BOM for consistency with no encoding
152+
// feature enabled path
153+
#[cfg(feature = "encoding")]
154+
if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
155+
if $self.parser.encoding.can_be_refined() {
156+
$self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding);
157+
}
158+
}
159+
160+
// Removes UTF-8 BOM if it is present
161+
#[cfg(not(feature = "encoding"))]
162+
$reader.remove_utf8_bom() $(.$await)? ?;
163+
164+
$self.$read_until_open($buf) $(.$await)?
165+
},
166+
ParseState::ClosedTag => $self.$read_until_open($buf) $(.$await)?,
149167
ParseState::OpenedTag => $self.$read_until_close($buf) $(.$await)?,
150168
ParseState::Empty => $self.parser.close_expanded_empty(),
151169
ParseState::Exit => return Ok(Event::Eof),
@@ -160,7 +178,7 @@ macro_rules! read_event_impl {
160178

161179
macro_rules! read_until_open {
162180
(
163-
$self:ident, $buf:ident, $first:ident,
181+
$self:ident, $buf:ident,
164182
$reader:expr,
165183
$read_event:ident
166184
$(, $await:ident)?
@@ -180,7 +198,7 @@ macro_rules! read_until_open {
180198
.read_bytes_until(b'<', $buf, &mut $self.parser.offset)
181199
$(.$await)?
182200
{
183-
Ok(Some(bytes)) => $self.parser.read_text(bytes, $first),
201+
Ok(Some(bytes)) => $self.parser.read_text(bytes),
184202
Ok(None) => Ok(Event::Eof),
185203
Err(e) => Err(e),
186204
}
@@ -557,15 +575,15 @@ impl<R> Reader<R> {
557575
where
558576
R: XmlSource<'i, B>,
559577
{
560-
read_event_impl!(self, buf, read_until_open, read_until_close)
578+
read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
561579
}
562580

563581
/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
564-
fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result<Event<'i>>
582+
fn read_until_open<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
565583
where
566584
R: XmlSource<'i, B>,
567585
{
568-
read_until_open!(self, buf, first, self.reader, read_event_impl)
586+
read_until_open!(self, buf, self.reader, read_event_impl)
569587
}
570588

571589
/// Private function to read until `>` is found. This function expects that
@@ -595,6 +613,14 @@ impl<R> Reader<R> {
595613
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
596614
/// from which events can borrow
597615
trait XmlSource<'r, B> {
616+
/// Removes UTF-8 BOM if it is present
617+
#[cfg(not(feature = "encoding"))]
618+
fn remove_utf8_bom(&mut self) -> Result<()>;
619+
620+
/// Determines encoding from the start of input and removes BOM if it is present
621+
#[cfg(feature = "encoding")]
622+
fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;
623+
598624
/// Read input until `byte` is found or end of input is reached.
599625
///
600626
/// Returns a slice of data read up to `byte`, which does not include into result.
@@ -1579,10 +1605,39 @@ mod test {
15791605
use crate::reader::Reader;
15801606
use pretty_assertions::assert_eq;
15811607

1608+
/// When `encoding` feature is enabled, encoding should be detected
1609+
/// from BOM (UTF-8) and BOM should be stripped.
1610+
///
1611+
/// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1612+
/// character should be stripped for consistency
1613+
#[$test]
1614+
$($async)? fn bom_from_reader() {
1615+
let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1616+
1617+
assert_eq!(
1618+
reader.$read_event($buf) $(.$await)? .unwrap(),
1619+
Event::Text(BytesText::from_escaped("\u{feff}"))
1620+
);
1621+
1622+
assert_eq!(
1623+
reader.$read_event($buf) $(.$await)? .unwrap(),
1624+
Event::Eof
1625+
);
1626+
}
1627+
1628+
/// When parsing from &str, encoding is fixed (UTF-8), so
1629+
/// - when `encoding` feature is disabled, the behavior the
1630+
/// same as in `bom_from_reader` text
1631+
/// - when `encoding` feature is enabled, the behavior should
1632+
/// stay consistent, so the first BOM character is stripped
15821633
#[$test]
1583-
#[should_panic] // Failure is expected until read_until_open() is smart enough to skip over irrelevant text events.
1584-
$($async)? fn bom_at_start() {
1585-
let mut reader = Reader::from_str("\u{feff}");
1634+
$($async)? fn bom_from_str() {
1635+
let mut reader = Reader::from_str("\u{feff}\u{feff}");
1636+
1637+
assert_eq!(
1638+
reader.$read_event($buf) $(.$await)? .unwrap(),
1639+
Event::Text(BytesText::from_escaped("\u{feff}"))
1640+
);
15861641

15871642
assert_eq!(
15881643
reader.$read_event($buf) $(.$await)? .unwrap(),

src/reader/parser.rs

Lines changed: 2 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#[cfg(feature = "encoding")]
22
use encoding_rs::UTF_8;
33

4-
use crate::encoding::{self, Decoder};
4+
use crate::encoding::Decoder;
55
use crate::errors::{Error, Result};
66
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
77
#[cfg(feature = "encoding")]
@@ -63,11 +63,9 @@ impl Parser {
6363
///
6464
/// # Parameters
6565
/// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
66-
/// - `first`: if `true`, then this is the first call of that function,
67-
/// i. e. data from the start of stream and the bytes will be checked for a BOM.
6866
///
6967
/// [`Text`]: Event::Text
70-
pub fn read_text<'b>(&mut self, bytes: &'b [u8], first: bool) -> Result<Event<'b>> {
68+
pub fn read_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
7169
let mut content = bytes;
7270

7371
if self.trim_text_end {
@@ -79,20 +77,6 @@ impl Parser {
7977
content = &bytes[..len];
8078
}
8179

82-
if first {
83-
#[cfg(feature = "encoding")]
84-
if self.encoding.can_be_refined() {
85-
if let Some(encoding) = encoding::detect_encoding(bytes) {
86-
self.encoding = EncodingRef::BomDetected(encoding);
87-
content = encoding::remove_bom(content, encoding);
88-
}
89-
}
90-
#[cfg(not(feature = "encoding"))]
91-
if bytes.starts_with(encoding::UTF8_BOM) {
92-
content = &bytes[encoding::UTF8_BOM.len()..];
93-
}
94-
}
95-
9680
Ok(Event::Text(BytesText::wrap(content, self.decoder())))
9781
}
9882

src/reader/slice_reader.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use std::borrow::Cow;
77
#[cfg(feature = "encoding")]
88
use crate::reader::EncodingRef;
99
#[cfg(feature = "encoding")]
10-
use encoding_rs::UTF_8;
10+
use encoding_rs::{Encoding, UTF_8};
1111

1212
use crate::errors::{Error, Result};
1313
use crate::events::Event;
@@ -236,6 +236,23 @@ impl<'a> Reader<&'a [u8]> {
236236
/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
237237
/// that will be borrowed by events. This implementation provides a zero-copy deserialization
238238
impl<'a> XmlSource<'a, ()> for &'a [u8] {
239+
#[cfg(not(feature = "encoding"))]
240+
fn remove_utf8_bom(&mut self) -> Result<()> {
241+
if self.starts_with(crate::encoding::UTF8_BOM) {
242+
*self = &self[crate::encoding::UTF8_BOM.len()..];
243+
}
244+
Ok(())
245+
}
246+
247+
#[cfg(feature = "encoding")]
248+
fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
249+
if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
250+
*self = &self[bom_len..];
251+
return Ok(Some(enc));
252+
}
253+
Ok(None)
254+
}
255+
239256
fn read_bytes_until(
240257
&mut self,
241258
byte: u8,

tests/encodings.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@ mod decode {
1818
#[test]
1919
fn test_detect_encoding() {
2020
// No BOM
21-
assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some(UTF_8));
21+
assert_eq!(detect_encoding(UTF8_TEXT.as_bytes()), Some((UTF_8, 0)));
2222
// BOM
23-
assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some(UTF_8));
24-
assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some(UTF_16BE));
25-
assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some(UTF_16LE));
23+
assert_eq!(detect_encoding(UTF8_TEXT_WITH_BOM), Some((UTF_8, 3)));
24+
assert_eq!(detect_encoding(UTF16BE_TEXT_WITH_BOM), Some((UTF_16BE, 2)));
25+
assert_eq!(detect_encoding(UTF16LE_TEXT_WITH_BOM), Some((UTF_16LE, 2)));
2626
}
2727
}
2828

0 commit comments

Comments
 (0)