Skip to content

Commit ed32e67

Browse files
committed
WIP: various unescape strategies.
1 parent 6d8ccb6 commit ed32e67

File tree

8 files changed

+268
-69
lines changed

8 files changed

+268
-69
lines changed

Cargo.lock

Lines changed: 35 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ rustdoc-args = ["--cfg", "docsrs"]
2121
default = []
2222
unescape = ["_unescape_either", "dep:hashify", "dep:serde_json"]
2323
unescape_fast = ["_unescape_either", "dep:matchgen", "dep:serde_json"]
24+
unescape_phf = ["_unescape_either", "entities"]
25+
unescape_quick = ["_unescape_either", "entities_quick"]
2426
entities = ["dep:phf", "dep:phf_codegen", "dep:serde_json"]
27+
entities_quick = ["dep:quickphf", "dep:quickphf_codegen", "dep:serde_json"]
2528
# Enable iai benchmarks
2629
iai = []
2730
# Make internal functions like unescape_fast public for benchmarks.
@@ -33,13 +36,15 @@ _unescape_either = []
3336
matchgen = { version = "0.4.0", optional = true }
3437
phf = { version = "0.11.1", default-features = false, optional = true }
3538
phf_codegen = { version = "0.11.1", optional = true }
39+
quickphf_codegen = { git = "https://github.com/AlexTMjugador/quickphf.git", branch = "feat/support-array-key-vals", optional = true }
3640
serde_json = { version = "1.0", optional = true }
3741

3842
[dependencies]
3943
hashify = { version = "0.2.6", optional = true }
4044
memchr = "2.5.0"
4145
pastey = "0.1.0"
4246
phf = { version = "0.11.1", default-features = false, optional = true }
47+
quickphf = { git = "https://github.com/AlexTMjugador/quickphf.git", branch = "feat/support-array-key-vals", optional = true }
4348

4449
[dev-dependencies]
4550
assert2 = "0.3.7"

benches/unescape.rs

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,28 @@ fn benchmarks(c: &mut Criterion) {
4343
let input = util::inputs::make_sample(128, entity, "a");
4444

4545
#[cfg(feature = "unescape")]
46+
util::benchmark_name!(
47+
group,
48+
"hashify",
49+
(Map::<Hashify>::default(), ContextGeneral),
50+
&name,
51+
&input
52+
);
53+
54+
#[cfg(feature = "unescape_phf")]
4655
util::benchmark_name!(
4756
group,
4857
"phf",
49-
(Phf, ContextGeneral),
58+
(Map::<PhfMap>::default(), ContextGeneral),
59+
&name,
60+
&input
61+
);
62+
63+
#[cfg(feature = "unescape_quick")]
64+
util::benchmark_name!(
65+
group,
66+
"quickphf",
67+
(Map::<QuickPhf>::default(), ContextGeneral),
5068
&name,
5169
&input
5270
);
@@ -67,10 +85,28 @@ fn benchmarks(c: &mut Criterion) {
6785
let input = util::inputs::make_sample(128, entity, "a");
6886

6987
#[cfg(feature = "unescape")]
88+
util::benchmark_name!(
89+
group,
90+
"hashify",
91+
(Map::<Hashify>::default(), ContextAttribute),
92+
&name,
93+
&input
94+
);
95+
96+
#[cfg(feature = "unescape_phf")]
7097
util::benchmark_name!(
7198
group,
7299
"phf",
73-
(Phf, ContextAttribute),
100+
(Map::<PhfMap>::default(), ContextAttribute),
101+
&name,
102+
&input
103+
);
104+
105+
#[cfg(feature = "unescape_quick")]
106+
util::benchmark_name!(
107+
group,
108+
"quickphf",
109+
(Map::<QuickPhf>::default(), ContextAttribute),
74110
&name,
75111
&input
76112
);

benches/unescape_iai.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@ macro_rules! iai_benchmarks {
1616
$(
1717
#[cfg(feature = "unescape")]
1818
fn [<iai_slow_unescape_ $name>]() -> Cow<'static, str> {
19-
unescape_in((Phf, ContextGeneral), black_box($input))
19+
unescape_in((Map::<PhfMap>::default(), ContextGeneral), black_box($input))
2020
}
2121

2222
#[cfg(feature = "unescape")]
2323
fn [<iai_slow_unescape_attribute_ $name>]() -> Cow<'static, str> {
24-
unescape_in((Phf, ContextAttribute), black_box($input))
24+
unescape_in((Map::<PhfMap>::default(), ContextAttribute), black_box($input))
2525
}
2626

2727
#[cfg(feature = "unescape_fast")]

build.rs

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ fn main() {
1414
#[cfg(any(
1515
feature = "unescape_fast",
1616
feature = "unescape",
17-
feature = "entities"
17+
feature = "entities",
18+
feature = "entities_quick"
1819
))]
1920
let entities = load_entities("entities.json");
2021

@@ -29,6 +30,9 @@ fn main() {
2930

3031
#[cfg(feature = "entities")]
3132
generate_entities_rs(&entities);
33+
34+
#[cfg(feature = "entities_quick")]
35+
generate_entities_quick_rs(&entities);
3236
}
3337

3438
/// Generate entities.rs file containing all valid HTML entities in a
@@ -94,6 +98,72 @@ fn generate_entities_rs(entities: &[(String, String)]) {
9498
.unwrap();
9599
}
96100

101+
/// Generate `entities_quick.rs` file containing all valid HTML entities in a
/// [`quickphf::PhfMap`]. It also generates documentation with a table of all
/// the entities and their expansions.
///
/// `entities` is a list of `(entity, expansion)` pairs. The generated file is
/// written into the directory named by the `OUT_DIR` environment variable.
///
/// # Panics
///
/// Panics if `OUT_DIR` is unset or not valid Unicode, or if the output file
/// cannot be created, written, or flushed.
#[cfg(feature = "entities_quick")]
fn generate_entities_quick_rs(entities: &[(String, String)]) {
    use std::env;
    use std::fs::File;
    use std::io::{BufWriter, Write};
    use std::path::Path;

    let out_path =
        Path::new(&env::var("OUT_DIR").unwrap()).join("entities_quick.rs");
    let mut out = BufWriter::new(File::create(out_path).unwrap());

    // Doc comment for the generated static, ending with the table header.
    writeln!(out, "\
        /// A map of all valid HTML entities to their expansions.\n\
        ///\n\
        /// The keys of the map are full entity byte strings, e.g. `b\"&copy;\"`, and the\n\
        /// values are their expansions, e.g. `b\"©\"`.\n\
        ///\n\
        /// See the [WHATWG HTML spec][spec] for the canonical list of entities with\n\
        /// their codepoints and glyphs. The [entities.json][] file linked there is\n\
        /// used to generate this constant.\n\
        ///\n\
        /// [spec]: https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references\n\
        /// [entities.json]: https://html.spec.whatwg.org/entities.json\n\
        ///\n\
        /// Entity | Codepoints | Glyph\n\
        /// -------------------------------|--------------------|------").unwrap();

    // One documentation table row per entity, in input order.
    for (name, glyph) in entities {
        // `{:28}` would pad the output inside the backticks.
        let name = format!("`{name}`");

        let codepoints = glyph
            .chars()
            .map(|c| format!("U+{:06X}", u32::from(c)))
            .collect::<Vec<_>>()
            .join(", ");

        // Suppress a few inconvenient glyphs. Newline adds an extra line, and
        // tab causes a clippy warning. Backticks are actually fine, but it’s
        // correct to escape them.
        let glyph = match glyph.as_str() {
            "\n" | "\t" => "",
            "`" => "\\`",
            v => v,
        };

        writeln!(out, "/// {name:30} | {codepoints:18} | {glyph}").unwrap();
    }

    // Parallel key/value lists, in the form `quickphf_codegen::build_map` is
    // called with.
    let (keys, values): (Vec<_>, Vec<_>) = entities
        .iter()
        .map(|(name, glyph)| (name.as_bytes(), glyph.as_bytes()))
        .unzip();

    writeln!(out, "#[allow(clippy::unreadable_literal)]").unwrap();
    writeln!(
        out,
        "pub static ENTITIES_QUICK: quickphf::PhfMap<&[u8], &[u8]> = {};",
        quickphf_codegen::build_map(&keys, &values),
    )
    .unwrap();

    // `BufWriter` ignores errors when it flushes on drop, so flush explicitly
    // to turn a failed final write into a build failure instead of leaving a
    // truncated file to be `include!`d.
    out.flush().unwrap();
}
166+
97167
/// Generate `entities_length.rs` file containing constants with the minimum
98168
/// and maximum entity lengths.
99169
#[cfg(any(feature = "unescape", feature = "entities"))]
@@ -152,7 +222,7 @@ fn generate_unescape_entity_rs(entities: &[(String, String)]) {
152222
/// Get expansion or `None` for a candidate HTML entity byte string.\n\
153223
#[must_use]\n\
154224
#[allow(clippy::too_many_lines)]\n\
155-
fn expand_entity(candidate: &[u8]) -> Option<&[u8]> {{\n\
225+
fn expand_entity(candidate: &[u8]) -> Option<&'static [u8]> {{\n\
156226
hashify::map! {{\n\
157227
candidate,\n\
158228
&[u8],"

src/lib.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,13 @@ feature! {
153153
}
154154
pub use entities::*;
155155
}
156+
157+
feature! {
158+
#![feature = "entities_quick"]
159+
160+
/// For some reason `rustdoc` doesn’t show the feature flags without `mod`.
161+
mod entities_quick {
162+
include!(concat!(env!("OUT_DIR"), "/entities_quick.rs"));
163+
}
164+
pub use entities_quick::*;
165+
}

0 commit comments

Comments
 (0)