|
| 1 | +use encoding_rs::*; |
| 2 | +use serde::Deserialize; |
| 3 | +use serde_json::from_reader; |
| 4 | +use std::collections::BTreeMap; |
| 5 | +use std::fs::{write, File}; |
| 6 | + |
| 7 | +type Index = Vec<Option<u32>>; |
| 8 | + |
| 9 | +/// Representation of https://github.com/whatwg/encoding/blob/main/indexes.json |
| 10 | +/// |
| 11 | +/// `ASCII = \u{0000}..=\u{007F}` |
| 12 | +#[derive(Debug, Deserialize)] |
| 13 | +#[serde(rename_all = "kebab-case")] |
| 14 | +struct Indexes { |
| 15 | + /// List of pairs _(index, codepoint)_. |
| 16 | + /// |
| 17 | + /// Unused by the generator, included to prevent getting into `single_byte` |
| 18 | + gb18030_ranges: Vec<(usize, u32)>, |
| 19 | + |
| 20 | + /// Normalization table of code points in the range `\u{FF61}` to `\u{FF9F}` |
| 21 | + /// for `ISO-2022-JP` encoding. |
| 22 | + /// |
| 23 | + /// First entry in the vector is a normalized value for `\u{FF61}`, the last |
| 24 | + /// is for `\u{FF9F}` (63 entries). |
| 25 | + /// |
| 26 | + /// Unused by the generator, included to prevent getting into `single_byte` |
| 27 | + iso_2022_jp_katakana: Vec<u32>, |
| 28 | + |
| 29 | + /// List of code points that can be encoded by the [`BIG5`] encoding. |
| 30 | + /// |
| 31 | + /// ```text |
| 32 | + /// ASCII + big5[((0xA1 - 0x81) * 157)..] |
| 33 | + /// ``` |
| 34 | + /// <https://encoding.spec.whatwg.org/#big5-encoder> |
| 35 | + big5: Index, |
| 36 | + |
| 37 | + /// List of code points that can be encoded by the [`EUC_KR`] encoding. |
| 38 | + /// |
| 39 | + /// ```text |
| 40 | + /// ASCII + EUC-KR table |
| 41 | + /// ``` |
| 42 | + /// <https://encoding.spec.whatwg.org/#euc-kr-encoder> |
| 43 | + euc_kr: Index, |
| 44 | + |
| 45 | + /// List of code points that can be encoded by the following encoding: |
| 46 | + /// |
| 47 | + /// ## [`GBK`] |
| 48 | + /// ```text |
| 49 | + /// ASCII + gb18030 table - U+E5E5 |
| 50 | + /// ``` |
| 51 | + /// <https://encoding.spec.whatwg.org/#gb18030-encoder> |
| 52 | + /// |
| 53 | + /// ## [`GB18030`] |
| 54 | + /// ```text |
| 55 | + /// all Unicode - U+E5E5 |
| 56 | + /// ``` |
| 57 | + /// <https://encoding.spec.whatwg.org/#gb18030-encoder> |
| 58 | + gb18030: Index, |
| 59 | + |
| 60 | + /// List of code points that can be encoded by the following encoding: |
| 61 | + /// |
| 62 | + /// ## [`EUC_JP`] |
| 63 | + /// ```text |
| 64 | + /// ASCII + U+00A5 + U+203E + U+FF61..=U+FF9F + U+2212 + jis0208 table (== ISO_2022_JP) |
| 65 | + /// ``` |
| 66 | + /// <https://encoding.spec.whatwg.org/#euc-jp-encoder> |
| 67 | + /// |
| 68 | + /// ## [`ISO_2022_JP`] |
| 69 | + /// ```text |
| 70 | + /// ASCII + U+00A5 + U+203E + U+2212 + U+FF61..=U+FF9F + jis0208 table (== EUC_JP) |
| 71 | + /// ``` |
| 72 | + /// <https://encoding.spec.whatwg.org/#iso-2022-jp-encoder> |
| 73 | + /// |
| 74 | + /// ## [`SHIFT_JIS`] |
| 75 | + /// ```text |
| 76 | + /// ASCII + U+0080 + U+00A5 + U+203E + U+FF61..=U+FF9F + U+2212 + jis0208 table |
| 77 | + /// (without jis0208[8272..=8835], but that slice contains code points that duplicated |
| 78 | + /// in the other part of that table) |
| 79 | + /// ``` |
| 80 | + /// <https://encoding.spec.whatwg.org/#shift_jis-encoder> |
| 81 | + jis0208: Index, |
| 82 | + |
| 83 | + /// Unused by the generator, included to prevent getting into `single_byte` |
| 84 | + jis0212: Index, |
| 85 | + |
| 86 | + /// List of code points that can be encoded by the single-byte encodings. |
| 87 | + /// |
| 88 | + /// ```text |
| 89 | + /// ASCII + corresponding table. |
| 90 | + /// ``` |
| 91 | + /// |
| 92 | + /// <https://encoding.spec.whatwg.org/#single-byte-encoder> |
| 93 | + #[serde(flatten)] |
| 94 | + single_byte: BTreeMap<String, Index>, |
| 95 | +} |
| 96 | + |
/// Returns `true` if `ch` may appear literally (not as a character reference)
/// in an XML 1.1 document.
///
/// > XML 1.1 allows the use of character references to the control characters
/// > #x1 through #x1F, most of which are forbidden in XML 1.0. For reasons of
/// > robustness, however, these characters still cannot be used directly in
/// > documents. In order to improve the robustness of character encoding detection,
/// > the additional control characters #x7F through #x9F, which were freely allowed
/// > in XML 1.0 documents, now must also appear only as character references.
/// > (Whitespace characters are of course exempt.)
///
/// <https://www.w3.org/TR/xml11/#sec-xml11>
fn is_literal_xml11_char(ch: char) -> bool {
    // These chars can only appear as character references
    // https://www.w3.org/TR/xml11/#NT-RestrictedChar
    let restricted = matches!(
        ch,
        '\u{0001}'..='\u{0008}'
            | '\u{000B}'..='\u{000C}'
            | '\u{000E}'..='\u{001F}'
            | '\u{007F}'..='\u{0084}'
            | '\u{0086}'..='\u{009F}'
    );
    // https://www.w3.org/TR/xml11/#NT-Char
    // (U+0000, U+FFFE and U+FFFF are not part of any range)
    let xml_char = matches!(
        ch,
        '\u{0001}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}'
    );
    xml_char && !restricted
}
| 124 | + |
/// Returns `true` if `ch` matches the XML 1.1 `NameStartChar` production.
///
/// Almost all characters can form a name. Citation from <https://www.w3.org/TR/xml11/#sec-xml11>:
///
/// > The overall philosophy of names has changed since XML 1.0. Whereas XML 1.0
/// > provided a rigid definition of names, wherein everything that was not permitted
/// > was forbidden, XML 1.1 names are designed so that everything that is not
/// > forbidden (for a specific reason) is permitted. Since Unicode will continue
/// > to grow past version 4.0, further changes to XML can be avoided by allowing
/// > almost any character, including those not yet assigned, in names.
///
/// <https://www.w3.org/TR/xml11/#NT-NameStartChar>
fn is_xml11_name_start_char(ch: char) -> bool {
    matches!(
        ch,
        ':' | 'A'..='Z'
            | '_'
            | 'a'..='z'
            | '\u{00C0}'..='\u{00D6}'
            | '\u{00D8}'..='\u{00F6}'
            | '\u{00F8}'..='\u{02FF}'
            | '\u{0370}'..='\u{037D}'
            | '\u{037F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
| 156 | + |
| 157 | +fn make_alphabet<I>(enc: &'static Encoding, codepoints: I) -> String |
| 158 | +where |
| 159 | + I: IntoIterator<Item = char>, |
| 160 | +{ |
| 161 | + let iter = codepoints.into_iter(); |
| 162 | + let mut alphabet = String::with_capacity(iter.size_hint().1.unwrap_or(256) * 4); |
| 163 | + // ASCII bytes (0x00 - 0x7F) does not included in encoding tables |
| 164 | + for ch in '\u{0000}'..='\u{007F}' { |
| 165 | + if is_literal_xml11_char(ch) { |
| 166 | + alphabet.push(ch); |
| 167 | + } |
| 168 | + } |
| 169 | + for (pointer, cp) in iter.enumerate() { |
| 170 | + // BIG5 encoding has unmappable code points in their index |
| 171 | + // https://github.com/whatwg/encoding/issues/293 |
| 172 | + // |
| 173 | + // 0-5023 - pointers of unmapped characters (0x8140-0xA13F in Big5) |
| 174 | + // 5024 - pointer of a U+3000 (0xA140 in Big5) |
| 175 | + if enc == BIG5 && pointer < 5024 { |
| 176 | + continue; |
| 177 | + } |
| 178 | + // SHIFT_JIS: codepoints[8272..=8835] should be excluded |
| 179 | + // https://encoding.spec.whatwg.org/#index-shift_jis-pointer |
| 180 | + if enc == SHIFT_JIS && (8272..=8835).contains(&pointer) { |
| 181 | + continue; |
| 182 | + } |
| 183 | + |
| 184 | + if is_literal_xml11_char(cp) { |
| 185 | + alphabet.push(cp); |
| 186 | + } |
| 187 | + } |
| 188 | + alphabet |
| 189 | +} |
| 190 | + |
| 191 | +fn make_xml<I>(enc: &'static Encoding, codepoints: I) |
| 192 | +where |
| 193 | + I: IntoIterator<Item = char>, |
| 194 | +{ |
| 195 | + println!( |
| 196 | + "{} - single:{}, ascii:{}", |
| 197 | + enc.name(), |
| 198 | + enc.is_single_byte(), |
| 199 | + enc.is_ascii_compatible() |
| 200 | + ); |
| 201 | + println!(" - making alphabet"); |
| 202 | + |
| 203 | + let alphabet = make_alphabet(enc, codepoints); |
| 204 | + |
| 205 | + println!(" - making xml"); |
| 206 | + |
| 207 | + let name = alphabet.replace(|ch| !is_xml11_name_start_char(ch), ""); |
| 208 | + let xml = format!( |
| 209 | + r#"<?xml version="1.1" encoding="{encoding}"?> |
| 210 | +<!--This is generated file. Edit <quick-xml>/test-gen/src/main.rs instead--> |
| 211 | +<root attribute1="{attr1}" |
| 212 | + attribute2='{attr2}' |
| 213 | + {attr_name}={attr3} |
| 214 | +> |
| 215 | + <?{pi}?> |
| 216 | + <!--{comment}--> |
| 217 | + {text} |
| 218 | + <ns:{element} ns:attribute="value1" xmlns:ns="namespace"/> |
| 219 | + <![CDATA[{text}]]> |
| 220 | +</root>"#, |
| 221 | + encoding = enc.name(), |
| 222 | + // https://www.w3.org/TR/xml11/#NT-AttValue |
| 223 | + attr1 = alphabet.replace(|ch| matches!(ch, '<' | '&' | '"'), ""), |
| 224 | + attr2 = alphabet.replace(|ch| matches!(ch, '<' | '&' | '\''), ""), |
| 225 | + attr_name = name, |
| 226 | + attr3 = name, |
| 227 | + pi = name, |
| 228 | + comment = alphabet, |
| 229 | + // https://www.w3.org/TR/xml11/#dt-chardata |
| 230 | + text = alphabet.replace(|ch| matches!(ch, '<' | '&'), ""), |
| 231 | + element = name, |
| 232 | + ); |
| 233 | + |
| 234 | + println!( |
| 235 | + " - encode and write ../tests/documents/encoding/{}.xml", |
| 236 | + enc.name() |
| 237 | + ); |
| 238 | + let (result, actual, _) = enc.encode(&xml); |
| 239 | + if enc == actual && enc != UTF_8 { |
| 240 | + write( |
| 241 | + format!("../tests/documents/encoding/{}.xml", enc.name()), |
| 242 | + result, |
| 243 | + ) |
| 244 | + .unwrap(); |
| 245 | + } |
| 246 | +} |
| 247 | +fn process_index(enc: &'static Encoding, codepoints: &Index) { |
| 248 | + make_xml( |
| 249 | + enc, |
| 250 | + codepoints.into_iter().filter_map(|cp| { |
| 251 | + // `char` cannot be deserialized from integer in JSON directly |
| 252 | + cp.map(|cp| char::from_u32(cp).expect(&format!("`{}` is not a code point", cp))) |
| 253 | + }), |
| 254 | + ) |
| 255 | +} |
| 256 | + |
| 257 | +/// Generates test files in {quick-xml}/tests/documents/encoding/{}.xml |
| 258 | +fn main() { |
| 259 | + let index = "encoding/indexes.json"; |
| 260 | + let file = File::open(index).expect(&format!( |
| 261 | + r#"unable to load `{}`. Probably `encoding` submodule does not fetched? Try to run |
| 262 | +
|
| 263 | + git submodule update --init -- encoding |
| 264 | +
|
| 265 | + in the current working dir (i. e. <quick-xml>/test-gen/) |
| 266 | + "#, |
| 267 | + index |
| 268 | + )); |
| 269 | + let indexes: Indexes = from_reader(file).expect(&format!("invalid format of `{}`", index)); |
| 270 | + |
| 271 | + process_index(BIG5, &indexes.big5); |
| 272 | + process_index(EUC_KR, &indexes.euc_kr); |
| 273 | + |
| 274 | + process_index(GBK, &indexes.gb18030); |
| 275 | + // It is too expensive to generate full Unicode alphabet, but at least pass significant part of them |
| 276 | + process_index(GB18030, &indexes.gb18030); |
| 277 | + |
| 278 | + process_index(EUC_JP, &indexes.jis0208); |
| 279 | + process_index(ISO_2022_JP, &indexes.jis0208); |
| 280 | + process_index(SHIFT_JIS, &indexes.jis0208); |
| 281 | + |
| 282 | + for (label, codepoints) in indexes.single_byte.into_iter() { |
| 283 | + let enc = Encoding::for_label(label.as_bytes()) |
| 284 | + .expect(&format!("label `{}` is unsupported", label)); |
| 285 | + |
| 286 | + process_index(enc, &codepoints); |
| 287 | + } |
| 288 | + // https://encoding.spec.whatwg.org/#x-user-defined-decoder |
| 289 | + make_xml(X_USER_DEFINED, '\u{F780}'..='\u{F7FF}'); |
| 290 | +} |
0 commit comments