Skip to content

Commit 6d8ccb6

Browse files
committed
WIP: Split unescape and entities features.
1 parent 2d2ac79 commit 6d8ccb6

File tree

8 files changed

+158
-70
lines changed

8 files changed

+158
-70
lines changed

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,20 @@ All notable changes to this project will be documented in this file.
1313
* Clarify examples in documentation and README.
1414
* Fix a few spelling mistakes in documentation.
1515

16+
### Breaking changes
17+
18+
* `unescape`: Use [hashify] to map entity byte strings to their expansions. This
19+
is faster than the old [phf] map, but still slower than [matchgen] in
20+
`unescape_fast`. Thanks to [xamgore] for the PR!
21+
* The `unescape` feature no longer automatically enables the `entities` feature.
22+
If you need the `ENTITIES` map, enable the `entities` feature.
23+
* Updated minimum supported Rust version (MSRV) to 1.74.1 to support [hashify].
24+
25+
[hashify]: https://crates.io/crates/hashify
26+
[matchgen]: https://crates.io/crates/matchgen
27+
[phf]: https://crates.io/crates/phf
28+
[xamgore]: https://github.com/xamgore
29+
1630
## Release 1.0.6 (2025-04-26)
1731

1832
* Switch dependency from [paste], which is no longer maintained, to a new fork,

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "htmlize"
3-
version = "1.0.6"
3+
version = "2.0.0"
44
authors = ["Daniel Parks <oss-htmlize@demonhorse.org>"]
55
description = "Correctly encode and decode HTML entities in UTF-8"
66
homepage = "https://github.com/danielparks/htmlize"
@@ -11,15 +11,15 @@ keywords = ["html", "entities", "escape", "unescape", "decode"]
1111
categories = ["web-programming", "encoding"]
1212
license = "MIT OR Apache-2.0"
1313
edition = "2021"
14-
rust-version = "1.60"
14+
rust-version = "1.74.1"
1515

1616
[package.metadata.docs.rs]
1717
all-features = true
1818
rustdoc-args = ["--cfg", "docsrs"]
1919

2020
[features]
2121
default = []
22-
unescape = ["entities", "_unescape_either"]
22+
unescape = ["_unescape_either", "dep:hashify", "dep:serde_json"]
2323
unescape_fast = ["_unescape_either", "dep:matchgen", "dep:serde_json"]
2424
entities = ["dep:phf", "dep:phf_codegen", "dep:serde_json"]
2525
# Enable iai benchmarks
@@ -36,10 +36,10 @@ phf_codegen = { version = "0.11.1", optional = true }
3636
serde_json = { version = "1.0", optional = true }
3737

3838
[dependencies]
39+
hashify = { version = "0.2.6", optional = true }
3940
memchr = "2.5.0"
4041
pastey = "0.1.0"
4142
phf = { version = "0.11.1", default-features = false, optional = true }
42-
hashify = "0.2.6"
4343

4444
[dev-dependencies]
4545
assert2 = "0.3.7"

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
[![docs.rs](https://img.shields.io/docsrs/htmlize)][docs.rs]
44
[![Crates.io](https://img.shields.io/crates/v/htmlize)][crates.io]
5-
![Rust version 1.60+](https://img.shields.io/badge/Rust%20version-1.60%2B-success)
5+
![Rust version 1.74.1+](https://img.shields.io/badge/Rust%20version-1.74.1%2B-success)
66

77
Htmlize handles both encoding raw strings to be safely inserted in HTML, and
88
decoding HTML text with entities to get back a raw string. It closely follows
@@ -134,8 +134,8 @@ The `escape` functions are all available with no features enabled.
134134
performance of the `unescape` version is already pretty good, so I don’t
135135
recommend enabling this unless you really need it.
136136

137-
* `unescape`: provide normal version of `unescape()`. This will
138-
automatically enable the `entities` feature.
137+
* `unescape`: provide normal version of `unescape()`. Enabling this will add a
138+
dependency on [hashify] and may slow builds by a few seconds.
139139

140140
* `entities`: build `ENTITIES` map. Enabling this will add a dependency
141141
on [phf] and may slow builds by a few seconds.
@@ -225,6 +225,7 @@ additional terms or conditions.
225225
[`unescape_bytes_in()`]: https://docs.rs/htmlize/1.0.6/htmlize/fn.unescape_bytes_in.html
226226
[`Cow`]: https://doc.rust-lang.org/std/borrow/enum.Cow.html
227227
[official WHATWG spec]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
228+
[hashify]: https://crates.io/crates/hashify
228229
[phf]: https://crates.io/crates/phf
229230
[features]: https://docs.rs/htmlize/1.0.6/htmlize/index.html#features
230231
[iai]: https://crates.io/crates/iai

build.rs

Lines changed: 108 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -11,22 +11,31 @@
1111
//! }
1212
1313
fn main() {
14-
#[cfg(any(feature = "unescape_fast", feature = "entities"))]
14+
#[cfg(any(
15+
feature = "unescape_fast",
16+
feature = "unescape",
17+
feature = "entities"
18+
))]
1519
let entities = load_entities("entities.json");
1620

1721
#[cfg(feature = "unescape_fast")]
1822
generate_matcher_rs(&entities);
1923

24+
#[cfg(feature = "unescape")]
25+
generate_unescape_entity_rs(&entities);
26+
27+
#[cfg(any(feature = "unescape", feature = "entities"))]
28+
generate_entities_length_rs(&entities);
29+
2030
#[cfg(feature = "entities")]
2131
generate_entities_rs(&entities);
2232
}
2333

2434
/// Generate entities.rs file containing all valid HTML entities in a
25-
/// [`phf::Map`] along with a few useful constants. It also generates
26-
/// documentation with all entities in a table.
35+
/// [`phf::Map`]. It also generates documentation with a table of all the
36+
/// entities and their expansions.
2737
#[cfg(feature = "entities")]
2838
fn generate_entities_rs(entities: &[(String, String)]) {
29-
use std::cmp::{max, min};
3039
use std::env;
3140
use std::fs::File;
3241
use std::io::{BufWriter, Write};
@@ -51,28 +60,9 @@ fn generate_entities_rs(entities: &[(String, String)]) {
5160
/// Entity | Codepoints | Glyph\n\
5261
/// -------------------------------|--------------------|------").unwrap();
5362

54-
let mut hashify = String::new();
55-
5663
let mut map_builder = phf_codegen::Map::<&[u8]>::new();
57-
let mut max_len: usize = 0;
58-
let mut min_len: usize = usize::MAX;
59-
let mut bare_max_len: usize = 0;
6064
for (name, glyph) in entities {
6165
map_builder.entry(name.as_bytes(), &format!("&{:?}", glyph.as_bytes()));
62-
max_len = max(max_len, name.len());
63-
min_len = min(min_len, name.len());
64-
if !name.ends_with(';') {
65-
bare_max_len = max(bare_max_len, name.len());
66-
}
67-
68-
{
69-
use std::fmt::Write;
70-
write!(&mut hashify, "\n b\"{name}\" => &[").unwrap();
71-
for &byte in glyph.as_bytes() {
72-
write!(&mut hashify, "{byte},").unwrap();
73-
}
74-
write!(&mut hashify, "],").unwrap();
75-
}
7666

7767
// `{:28}` would pad the output inside the backticks.
7868
let name = format!("`{name}`");
@@ -95,29 +85,98 @@ fn generate_entities_rs(entities: &[(String, String)]) {
9585
writeln!(out, "/// {name:30} | {codepoints:18} | {glyph}",).unwrap();
9686
}
9787

98-
let map = map_builder.build();
88+
writeln!(out, "#[allow(clippy::unreadable_literal)]").unwrap();
89+
writeln!(
90+
out,
91+
"pub static ENTITIES: phf::Map<&[u8], &[u8]> = {};",
92+
map_builder.build()
93+
)
94+
.unwrap();
95+
}
96+
97+
/// Generate `entities_length.rs` file containing constants with the minimum
98+
/// and maximum entity lengths.
99+
#[cfg(any(feature = "unescape", feature = "entities"))]
100+
fn generate_entities_length_rs(entities: &[(String, String)]) {
101+
use std::cmp::{max, min};
102+
use std::env;
103+
use std::fs::File;
104+
use std::io::{BufWriter, Write};
105+
use std::path::Path;
106+
107+
let out_path =
108+
Path::new(&env::var("OUT_DIR").unwrap()).join("entities_length.rs");
109+
let mut out = BufWriter::new(File::create(out_path).unwrap());
110+
111+
let mut max_len: usize = 0;
112+
let mut min_len: usize = usize::MAX;
113+
let mut bare_max_len: usize = 0;
114+
for (name, _) in entities {
115+
max_len = max(max_len, name.len());
116+
min_len = min(min_len, name.len());
117+
if !name.ends_with(';') {
118+
bare_max_len = max(bare_max_len, name.len());
119+
}
120+
}
121+
writeln!(
122+
out,
123+
"\
124+
/// Length of longest entity including ‘&’ and possibly ‘;’.\n\
125+
pub const ENTITY_MAX_LENGTH: usize = {max_len};\n\
126+
\n\
127+
/// Length of shortest entity including ‘&’ and possibly ‘;’.\n\
128+
pub const ENTITY_MIN_LENGTH: usize = {min_len};\n\
129+
\n\
130+
/// Length of longest semicolon-less entity including ‘&’.\n\
131+
pub const BARE_ENTITY_MAX_LENGTH: usize = {bare_max_len};"
132+
)
133+
.unwrap();
134+
}
135+
136+
/// Generate `expand_entity.rs` file containing a function that maps entity byte
137+
/// strings to their expansions.
138+
#[cfg(feature = "unescape")]
139+
fn generate_unescape_entity_rs(entities: &[(String, String)]) {
140+
use std::env;
141+
use std::fs::File;
142+
use std::io::{BufWriter, Write};
143+
use std::path::Path;
144+
145+
let out_path =
146+
Path::new(&env::var("OUT_DIR").unwrap()).join("expand_entity.rs");
147+
let mut out = BufWriter::new(File::create(out_path).unwrap());
148+
149+
writeln!(
150+
out,
151+
"\
152+
/// Get expansion or `None` for a candidate HTML entity byte string.\n\
153+
#[must_use]\n\
154+
#[allow(clippy::too_many_lines)]\n\
155+
fn expand_entity(candidate: &[u8]) -> Option<&[u8]> {{\n\
156+
hashify::map! {{\n\
157+
candidate,\n\
158+
&[u8],"
159+
)
160+
.unwrap();
161+
162+
for (name, glyph) in entities {
163+
write!(
164+
out,
165+
"\n\
166+
b\"{name}\" => &["
167+
)
168+
.unwrap();
169+
for &byte in glyph.as_bytes() {
170+
write!(out, "{byte},").unwrap();
171+
}
172+
write!(out, "],").unwrap();
173+
}
174+
99175
writeln!(
100176
out,
101-
r#"#[allow(clippy::unreadable_literal)]
102-
pub static ENTITIES: phf::Map<&[u8], &[u8]> = {map};
103-
104-
/// Length of longest entity including ‘&’ and possibly ‘;’.
105-
pub const ENTITY_MAX_LENGTH: usize = {max_len};
106-
107-
/// Length of shortest entity including ‘&’ and possibly ‘;’.
108-
pub const ENTITY_MIN_LENGTH: usize = {min_len};
109-
110-
/// Length of longest semicolon-less entity including ‘&’.
111-
pub const BARE_ENTITY_MAX_LENGTH: usize = {bare_max_len};
112-
113-
/// Get an unescaped character by its HTML entity
114-
pub(crate) fn get_entity(candidate: &[u8]) -> Option<&[u8]> {{
115-
hashify::map! {{
116-
candidate,
117-
&[u8],{hashify}
118-
}}
119-
}}
120-
"#
177+
"\n\
178+
}}\n\
179+
}}"
121180
)
122181
.unwrap();
123182
}
@@ -146,7 +205,11 @@ fn generate_matcher_rs(entities: &[(String, String)]) {
146205
}
147206

148207
/// Load HTML entities as `vec![...("&gt;", ">")...]`.
149-
#[cfg(any(feature = "unescape_fast", feature = "entities"))]
208+
#[cfg(any(
209+
feature = "unescape_fast",
210+
feature = "unescape",
211+
feature = "entities"
212+
))]
150213
fn load_entities<P: AsRef<std::path::Path>>(path: P) -> Vec<(String, String)> {
151214
let input = std::fs::read(path.as_ref()).unwrap();
152215
let input: serde_json::Map<String, serde_json::Value> =

src/entities.rs

Lines changed: 0 additions & 5 deletions
This file was deleted.

src/lib.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ assert!(htmlize::unescape("3 &times 4 &gt; 10") == "3 × 4 > 10");
6767
//! performance of the `unescape` version is already pretty good, so I
6868
//! don’t recommend enabling this unless you really need it.
6969
//!
70-
//! * `unescape`: provide normal version of [`unescape()`]. This will
71-
//! automatically enable the `entities` feature.
70+
//! * `unescape`: provide normal version of [`unescape()`]. Enabling this will
71+
//! add a dependency on [hashify] and may slow builds by a few seconds.
7272
//!
7373
//! * `entities`: build [`ENTITIES`] map. Enabling this will add a dependency
7474
//! on [phf] and may slow builds by a few seconds.
@@ -89,10 +89,11 @@ assert!(htmlize::unescape("3 &times 4 &gt; 10") == "3 × 4 > 10");
8989
//!
9090
//! # Minimum supported Rust version
9191
//!
92-
//! Currently the minimum supported Rust version (MSRV) is **1.60**. Future
92+
//! Currently the minimum supported Rust version (MSRV) is **1.74.1**. Future
9393
//! increases in the MSRV will require a major version bump.
9494
//!
9595
//! [official WHATWG spec]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
96+
//! [hashify]: https://crates.io/crates/hashify
9697
//! [phf]: https://crates.io/crates/phf
9798
//! [iai]: https://crates.io/crates/iai
9899
//! [benchmarks]: https://github.com/danielparks/htmlize#benchmarks
@@ -133,9 +134,22 @@ feature! {
133134
pub use unescape::*;
134135
}
135136

137+
feature! {
138+
#![any(feature = "unescape", feature = "entities")]
139+
140+
/// For some reason `rustdoc` doesn’t show the feature flags without `mod`.
141+
mod entities_length {
142+
include!(concat!(env!("OUT_DIR"), "/entities_length.rs"));
143+
}
144+
pub use entities_length::*;
145+
}
146+
136147
feature! {
137148
#![feature = "entities"]
138149

139-
mod entities;
150+
/// For some reason `rustdoc` doesn’t show the feature flags without `mod`.
151+
mod entities {
152+
include!(concat!(env!("OUT_DIR"), "/entities.rs"));
153+
}
140154
pub use entities::*;
141155
}

0 commit comments

Comments
 (0)