Skip to content

Commit 7f34520

Browse files
Mingundralley
authored and committed
Add tests for encoding detection
`test-gen` is a project used to generate test files in the tests/documents/encoding directory.

failures (2): detect::utf16be, detect::utf16le
1 parent d7dae47 commit 7f34520

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+1005
-0
lines changed

.gitattributes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
/tests/documents/* text eol=lf
33
/tests/documents/encoding/* text eol=lf
44

5+
/tests/documents/encoding/utf16be.xml binary
6+
/tests/documents/encoding/utf16le.xml binary
57
/tests/documents/encoding/utf16be-bom.xml binary
68
/tests/documents/encoding/utf16le-bom.xml binary
79
/tests/documents/sample_5_utf16bom.xml binary

.gitmodules

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[submodule "encoding"]
2+
path = test-gen/encoding
3+
url = https://github.com/whatwg/encoding.git
4+
shallow = true

test-gen/Cargo.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[package]
2+
name = "test-gen"
3+
version = "0.1.0"
4+
edition = "2021"
5+
6+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
7+
8+
[dependencies]
9+
encoding_rs = "0.8"
10+
serde = { version = "1.0", features = ["derive"] }
11+
serde_json = "1.0"

test-gen/encoding

Submodule encoding added at 4f549cd

test-gen/src/main.rs

Lines changed: 290 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,290 @@
1+
use encoding_rs::*;
2+
use serde::Deserialize;
3+
use serde_json::from_reader;
4+
use std::collections::BTreeMap;
5+
use std::fs::{write, File};
6+
7+
/// One table of `indexes.json` as deserialized from JSON: a list of optional
/// code points stored as plain integers.
///
/// NOTE(review): presumably a position in the list is the "pointer" of the
/// WHATWG index and `None` stands for a JSON `null` (no code point at that
/// pointer) — confirm against the data file.
type Index = Vec<Option<u32>>;
8+
9+
/// Representation of https://github.com/whatwg/encoding/blob/main/indexes.json
10+
///
11+
/// `ASCII = \u{0000}..=\u{007F}`
12+
#[derive(Debug, Deserialize)]
13+
#[serde(rename_all = "kebab-case")]
14+
struct Indexes {
15+
/// List of pairs _(index, codepoint)_.
16+
///
17+
/// Unused by the generator, included to prevent getting into `single_byte`
18+
gb18030_ranges: Vec<(usize, u32)>,
19+
20+
/// Normalization table of code points in the range `\u{FF61}` to `\u{FF9F}`
21+
/// for `ISO-2022-JP` encoding.
22+
///
23+
/// First entry in the vector is a normalized value for `\u{FF61}`, the last
24+
/// is for `\u{FF9F}` (63 entries).
25+
///
26+
/// Unused by the generator, included to prevent getting into `single_byte`
27+
iso_2022_jp_katakana: Vec<u32>,
28+
29+
/// List of code points that can be encoded by the [`BIG5`] encoding.
30+
///
31+
/// ```text
32+
/// ASCII + big5[((0xA1 - 0x81) * 157)..]
33+
/// ```
34+
/// <https://encoding.spec.whatwg.org/#big5-encoder>
35+
big5: Index,
36+
37+
/// List of code points that can be encoded by the [`EUC_KR`] encoding.
38+
///
39+
/// ```text
40+
/// ASCII + EUC-KR table
41+
/// ```
42+
/// <https://encoding.spec.whatwg.org/#euc-kr-encoder>
43+
euc_kr: Index,
44+
45+
/// List of code points that can be encoded by the following encoding:
46+
///
47+
/// ## [`GBK`]
48+
/// ```text
49+
/// ASCII + gb18030 table - U+E5E5
50+
/// ```
51+
/// <https://encoding.spec.whatwg.org/#gb18030-encoder>
52+
///
53+
/// ## [`GB18030`]
54+
/// ```text
55+
/// all Unicode - U+E5E5
56+
/// ```
57+
/// <https://encoding.spec.whatwg.org/#gb18030-encoder>
58+
gb18030: Index,
59+
60+
/// List of code points that can be encoded by the following encoding:
61+
///
62+
/// ## [`EUC_JP`]
63+
/// ```text
64+
/// ASCII + U+00A5 + U+203E + U+FF61..=U+FF9F + U+2212 + jis0208 table (== ISO_2022_JP)
65+
/// ```
66+
/// <https://encoding.spec.whatwg.org/#euc-jp-encoder>
67+
///
68+
/// ## [`ISO_2022_JP`]
69+
/// ```text
70+
/// ASCII + U+00A5 + U+203E + U+2212 + U+FF61..=U+FF9F + jis0208 table (== EUC_JP)
71+
/// ```
72+
/// <https://encoding.spec.whatwg.org/#iso-2022-jp-encoder>
73+
///
74+
/// ## [`SHIFT_JIS`]
75+
/// ```text
76+
/// ASCII + U+0080 + U+00A5 + U+203E + U+FF61..=U+FF9F + U+2212 + jis0208 table
77+
/// (without jis0208[8272..=8835], but that slice contains code points that duplicated
78+
/// in the other part of that table)
79+
/// ```
80+
/// <https://encoding.spec.whatwg.org/#shift_jis-encoder>
81+
jis0208: Index,
82+
83+
/// Unused by the generator, included to prevent getting into `single_byte`
84+
jis0212: Index,
85+
86+
/// List of code points that can be encoded by the single-byte encodings.
87+
///
88+
/// ```text
89+
/// ASCII + corresponding table.
90+
/// ```
91+
///
92+
/// <https://encoding.spec.whatwg.org/#single-byte-encoder>
93+
#[serde(flatten)]
94+
single_byte: BTreeMap<String, Index>,
95+
}
96+
97+
/// Returns `true` when `ch` may be written directly (not as a character
/// reference) in an XML 1.1 document.
///
/// The rationale, quoted from the specification:
///
/// > XML 1.1 allows the use of character references to the control characters
/// > #x1 through #x1F, most of which are forbidden in XML 1.0. For reasons of
/// > robustness, however, these characters still cannot be used directly in
/// > documents. In order to improve the robustness of character encoding detection,
/// > the additional control characters #x7F through #x9F, which were freely allowed
/// > in XML 1.0 documents, now must also appear only as character references.
/// > (Whitespace characters are of course exempt.)
///
/// <https://www.w3.org/TR/xml11/#sec-xml11>
fn is_literal_xml11_char(ch: char) -> bool {
    // Characters that are legal in a document, but only as character references
    // https://www.w3.org/TR/xml11/#NT-RestrictedChar
    let restricted = matches!(
        ch,
        '\u{0001}'..='\u{0008}'
            | '\u{000B}'..='\u{000C}'
            | '\u{000E}'..='\u{001F}'
            | '\u{007F}'..='\u{0084}'
            | '\u{0086}'..='\u{009F}'
    );
    // Everything that the `Char` production accepts (U+0000, U+FFFE and U+FFFF
    // are not part of it)
    // https://www.w3.org/TR/xml11/#NT-Char
    let allowed = matches!(
        ch,
        '\u{0001}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'
    );

    allowed && !restricted
}
124+
125+
/// Returns `true` when `ch` matches the XML 1.1 `NameStartChar` production.
///
/// Nearly every character qualifies. From <https://www.w3.org/TR/xml11/#sec-xml11>:
///
/// > The overall philosophy of names has changed since XML 1.0. Whereas XML 1.0
/// > provided a rigid definition of names, wherein everything that was not permitted
/// > was forbidden, XML 1.1 names are designed so that everything that is not
/// > forbidden (for a specific reason) is permitted. Since Unicode will continue
/// > to grow past version 4.0, further changes to XML can be avoided by allowing
/// > almost any character, including those not yet assigned, in names.
///
/// <https://www.w3.org/TR/xml11/#NT-NameStartChar>
fn is_xml11_name_start_char(ch: char) -> bool {
    // Inclusive `(first, last)` bounds, in the order of the grammar production
    const NAME_START_RANGES: &[(char, char)] = &[
        (':', ':'),
        ('A', 'Z'),
        ('_', '_'),
        ('a', 'z'),
        ('\u{00C0}', '\u{00D6}'),
        ('\u{00D8}', '\u{00F6}'),
        ('\u{00F8}', '\u{02FF}'),
        ('\u{0370}', '\u{037D}'),
        ('\u{037F}', '\u{1FFF}'),
        ('\u{200C}', '\u{200D}'),
        ('\u{2070}', '\u{218F}'),
        ('\u{2C00}', '\u{2FEF}'),
        ('\u{3001}', '\u{D7FF}'),
        ('\u{F900}', '\u{FDCF}'),
        ('\u{FDF0}', '\u{FFFD}'),
        ('\u{10000}', '\u{EFFFF}'),
    ];

    NAME_START_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}
156+
157+
fn make_alphabet<I>(enc: &'static Encoding, codepoints: I) -> String
158+
where
159+
I: IntoIterator<Item = char>,
160+
{
161+
let iter = codepoints.into_iter();
162+
let mut alphabet = String::with_capacity(iter.size_hint().1.unwrap_or(256) * 4);
163+
// ASCII bytes (0x00 - 0x7F) does not included in encoding tables
164+
for ch in '\u{0000}'..='\u{007F}' {
165+
if is_literal_xml11_char(ch) {
166+
alphabet.push(ch);
167+
}
168+
}
169+
for (pointer, cp) in iter.enumerate() {
170+
// BIG5 encoding has unmappable code points in their index
171+
// https://github.com/whatwg/encoding/issues/293
172+
//
173+
// 0-5023 - pointers of unmapped characters (0x8140-0xA13F in Big5)
174+
// 5024 - pointer of a U+3000 (0xA140 in Big5)
175+
if enc == BIG5 && pointer < 5024 {
176+
continue;
177+
}
178+
// SHIFT_JIS: codepoints[8272..=8835] should be excluded
179+
// https://encoding.spec.whatwg.org/#index-shift_jis-pointer
180+
if enc == SHIFT_JIS && (8272..=8835).contains(&pointer) {
181+
continue;
182+
}
183+
184+
if is_literal_xml11_char(cp) {
185+
alphabet.push(cp);
186+
}
187+
}
188+
alphabet
189+
}
190+
191+
fn make_xml<I>(enc: &'static Encoding, codepoints: I)
192+
where
193+
I: IntoIterator<Item = char>,
194+
{
195+
println!(
196+
"{} - single:{}, ascii:{}",
197+
enc.name(),
198+
enc.is_single_byte(),
199+
enc.is_ascii_compatible()
200+
);
201+
println!(" - making alphabet");
202+
203+
let alphabet = make_alphabet(enc, codepoints);
204+
205+
println!(" - making xml");
206+
207+
let name = alphabet.replace(|ch| !is_xml11_name_start_char(ch), "");
208+
let xml = format!(
209+
r#"<?xml version="1.1" encoding="{encoding}"?>
210+
<!--This is generated file. Edit <quick-xml>/test-gen/src/main.rs instead-->
211+
<root attribute1="{attr1}"
212+
attribute2='{attr2}'
213+
{attr_name}={attr3}
214+
>
215+
<?{pi}?>
216+
<!--{comment}-->
217+
{text}
218+
<ns:{element} ns:attribute="value1" xmlns:ns="namespace"/>
219+
<![CDATA[{text}]]>
220+
</root>"#,
221+
encoding = enc.name(),
222+
// https://www.w3.org/TR/xml11/#NT-AttValue
223+
attr1 = alphabet.replace(|ch| matches!(ch, '<' | '&' | '"'), ""),
224+
attr2 = alphabet.replace(|ch| matches!(ch, '<' | '&' | '\''), ""),
225+
attr_name = name,
226+
attr3 = name,
227+
pi = name,
228+
comment = alphabet,
229+
// https://www.w3.org/TR/xml11/#dt-chardata
230+
text = alphabet.replace(|ch| matches!(ch, '<' | '&'), ""),
231+
element = name,
232+
);
233+
234+
println!(
235+
" - encode and write ../tests/documents/encoding/{}.xml",
236+
enc.name()
237+
);
238+
let (result, actual, _) = enc.encode(&xml);
239+
if enc == actual && enc != UTF_8 {
240+
write(
241+
format!("../tests/documents/encoding/{}.xml", enc.name()),
242+
result,
243+
)
244+
.unwrap();
245+
}
246+
}
247+
fn process_index(enc: &'static Encoding, codepoints: &Index) {
248+
make_xml(
249+
enc,
250+
codepoints.into_iter().filter_map(|cp| {
251+
// `char` cannot be deserialized from integer in JSON directly
252+
cp.map(|cp| char::from_u32(cp).expect(&format!("`{}` is not a code point", cp)))
253+
}),
254+
)
255+
}
256+
257+
/// Generates test files in {quick-xml}/tests/documents/encoding/{}.xml
258+
fn main() {
259+
let index = "encoding/indexes.json";
260+
let file = File::open(index).expect(&format!(
261+
r#"unable to load `{}`. Probably `encoding` submodule does not fetched? Try to run
262+
263+
git submodule update --init -- encoding
264+
265+
in the current working dir (i. e. <quick-xml>/test-gen/)
266+
"#,
267+
index
268+
));
269+
let indexes: Indexes = from_reader(file).expect(&format!("invalid format of `{}`", index));
270+
271+
process_index(BIG5, &indexes.big5);
272+
process_index(EUC_KR, &indexes.euc_kr);
273+
274+
process_index(GBK, &indexes.gb18030);
275+
// It is too expensive to generate full Unicode alphabet, but at least pass significant part of them
276+
process_index(GB18030, &indexes.gb18030);
277+
278+
process_index(EUC_JP, &indexes.jis0208);
279+
process_index(ISO_2022_JP, &indexes.jis0208);
280+
process_index(SHIFT_JIS, &indexes.jis0208);
281+
282+
for (label, codepoints) in indexes.single_byte.into_iter() {
283+
let enc = Encoding::for_label(label.as_bytes())
284+
.expect(&format!("label `{}` is unsupported", label));
285+
286+
process_index(enc, &codepoints);
287+
}
288+
// https://encoding.spec.whatwg.org/#x-user-defined-decoder
289+
make_xml(X_USER_DEFINED, '\u{F780}'..='\u{F7FF}');
290+
}

tests/documents/encoding/Big5.xml

Lines changed: 17 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)