Skip to content

Commit 591ebd8

Browse files
committed
Implement an attribute normalization routine as described in "3.3.3 Attribute-Value Normalization" section of XML 1.1. spec
https://www.w3.org/TR/xml11/#AVNormalize
1 parent 085c142 commit 591ebd8

File tree

2 files changed

+341
-0
lines changed

2 files changed

+341
-0
lines changed

Changelog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@
3333

3434
### Misc Changes
3535

36+
- [#371]: New error variant `EscapeError::TooManyNestedEntities` was added.
37+
38+
[#371]: https://github.com/tafia/quick-xml/issues/371
3639
[#806]: https://github.com/tafia/quick-xml/issues/806
3740
[#878]: https://github.com/tafia/quick-xml/pull/878
3841
[#882]: https://github.com/tafia/quick-xml/pull/882

src/escape.rs

Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use memchr::{memchr, memchr2_iter, memchr3};
44
use std::borrow::Cow;
55
use std::num::ParseIntError;
66
use std::ops::Range;
7+
use std::slice::Iter;
78

89
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910
#[derive(Clone, Debug, PartialEq)]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253
InvalidCharRef(ParseCharRefError),
54+
/// More entities were expanded than the allowed maximum during attribute normalization.
55+
///
56+
/// Attribute normalization includes expanding of general entities (`&entity;`)
57+
/// whose replacement text may itself contain entities, which must also be expanded.
58+
/// If more than 128 entities would be expanded, this error is returned.
59+
TooManyNestedEntities,
5360
}
5461

5562
impl std::fmt::Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673
Self::InvalidCharRef(e) => {
6774
write!(f, "invalid character reference: {}", e)
6875
}
76+
Self::TooManyNestedEntities => {
77+
f.write_str("too many nested entities in an attribute value")
78+
}
6979
}
7080
}
7181
}
@@ -489,6 +499,219 @@ fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize,
489499

490500
////////////////////////////////////////////////////////////////////////////////////////////////////
491501

502+
const fn is_normalization_char(b: &u8) -> bool {
503+
// The following sequences should be translated into a single `\n` (U+000a) character
504+
// to normalize EOLs:
505+
//
506+
// |UTF-8 |String|
507+
// |--------|------|
508+
// |0d 0a |\r\n |
509+
// |0d c2 85|\r\x85|
510+
// |0d |\r |
511+
// |c2 85 |\x85 |
512+
// |e2 80 a8|\u2028|
513+
matches!(*b, b'\t' | b'\r' | b'\n' | 0xC2 | 0xE2 | b'&')
514+
}
515+
516+
/// Returns the attribute value normalized as per [the XML specification],
517+
/// using a custom entity resolver.
518+
///
519+
/// Do not use this method with HTML attributes.
520+
///
521+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
522+
/// and the characters `\t`, `\r`, `\n` are replaced with space characters. A function
523+
/// for resolving entities must be provided as `resolve_entity`. It is called for every named
524+
/// entity reference, including predefined ones such as `&gt;`.
525+
///
526+
/// This will allocate unless the raw attribute value does not require normalization.
527+
///
528+
/// # Parameters
529+
///
530+
/// - `value`: unnormalized attribute value
531+
/// - `depth`: maximum number of nested entities that can be expanded. If expansion
532+
/// chain is longer than this value, the function returns [`EscapeError::TooManyNestedEntities`]
533+
/// - `resolve_entity`: a function to resolve entity. This function could be called
534+
/// multiple times on the same input and can return different values in each case
535+
/// for the same input, although it is not recommended
536+
///
537+
/// # Lifetimes
538+
///
539+
/// - `'input`: lifetime of the unnormalized attribute. If normalization is not required,
540+
/// the input is returned unchanged with the same lifetime
541+
/// - `'entity`: lifetime of all entities that are returned by the entity resolution routine
542+
///
543+
/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
544+
pub(crate) fn normalize_attribute_value<'input, 'entity, F>(
545+
value: &'input str,
546+
depth: usize,
547+
mut resolve_entity: F,
548+
) -> Result<Cow<'input, str>, EscapeError>
549+
where
550+
// the lifetime of the output comes from a capture or is `'static`
551+
F: FnMut(&str) -> Option<&'entity str>,
552+
{
553+
let mut iter = value.as_bytes().iter();
554+
555+
// If we found a character that requires normalization, create a normalized
556+
// version of the attribute, otherwise return the value unchanged
557+
if let Some(i) = iter.position(is_normalization_char) {
558+
let mut normalized = String::with_capacity(value.len());
559+
let pos = normalize_attribute_step(
560+
&mut normalized,
561+
&mut iter,
562+
value,
563+
0,
564+
i,
565+
depth,
566+
&mut resolve_entity,
567+
)?;
568+
569+
normalize_attribute_steps(
570+
&mut normalized,
571+
&mut iter,
572+
value,
573+
pos,
574+
depth,
575+
&mut resolve_entity,
576+
)?;
577+
return Ok(normalized.into());
578+
}
579+
Ok(Cow::Borrowed(value))
580+
}
581+
582+
fn normalize_attribute_steps<'entity, F>(
583+
normalized: &mut String,
584+
iter: &mut Iter<u8>,
585+
input: &str,
586+
mut pos: usize,
587+
depth: usize,
588+
resolve_entity: &mut F,
589+
) -> Result<(), EscapeError>
590+
where
591+
// the lifetime of the output comes from a capture or is `'static`
592+
F: FnMut(&str) -> Option<&'entity str>,
593+
{
594+
while let Some(i) = iter.position(is_normalization_char) {
595+
pos =
596+
normalize_attribute_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?;
597+
}
598+
if let Some(rest) = input.get(pos..) {
599+
normalized.push_str(rest);
600+
}
601+
Ok(())
602+
}
603+
604+
/// Performs one step of the [normalization algorithm] (but with recursive part):
605+
///
606+
/// 1. For a character reference, append the referenced character
607+
/// to the normalized value.
608+
/// 2. For an entity reference, recursively apply this algorithm
609+
/// to the replacement text of the entity.
610+
/// 3. For a white space character (#x20, #xD, #xA, #x9), append
611+
/// a space character (#x20) to the normalized value.
612+
/// 4. For another character, append the character to the normalized value.
613+
///
614+
/// Because, [according to the specification], an XML parser should parse end-of-line
615+
/// normalized input, but quick-xml does not do that, this function also performs
616+
/// normalization of EOL characters. That should be done before expanding entities
617+
/// and character references, so cannot be processed later.
618+
///
619+
/// This function could be used also just to normalize line ends if the iterator
620+
/// does not stop on `&` characters.
621+
///
622+
/// # Parameters
623+
///
624+
/// - `normalized`: Output of the algorithm. Normalized value will be placed here
625+
/// - `iter`: Iterator over bytes of `input`
626+
/// - `input`: Original non-normalized value
627+
/// - `last_pos`: Index of the first byte in `input` that was not yet copied to `normalized`
628+
/// - `index`: Index of the byte in `input` that should be processed now
629+
/// - NOTE(review): `seen_cr` is documented here but is not a parameter of this function;
629+
/// bytes such as `\r` are handed to `normalize_xml_eol_step` with `' '` as the replacement
631+
/// - `depth`: Remaining number of nested expansions allowed. If it is `0`, the step fails with [`EscapeError::TooManyNestedEntities`]
632+
/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
633+
///
634+
/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
635+
/// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
636+
fn normalize_attribute_step<'entity, F>(
637+
normalized: &mut String,
638+
iter: &mut Iter<u8>,
639+
input: &str,
640+
last_pos: usize,
641+
index: usize,
642+
depth: usize,
643+
resolve_entity: &mut F,
644+
) -> Result<usize, EscapeError>
645+
where
646+
// the lifetime of the output comes from a capture or is `'static`
647+
F: FnMut(&str) -> Option<&'entity str>,
648+
{
649+
if depth == 0 {
650+
return Err(EscapeError::TooManyNestedEntities);
651+
}
652+
// 4. For another character, append the character to the normalized value.
653+
normalized.push_str(&input[last_pos..index]);
654+
655+
match input.as_bytes()[index] {
656+
b'&' => {
657+
let start = index + 1; // +1 - skip `&`
658+
let end = start
659+
+ match iter.position(|&b| b == b';') {
660+
Some(end) => end,
661+
None => return Err(EscapeError::UnterminatedEntity(index..input.len())),
662+
};
663+
664+
// Content between & and ; - &pat;
665+
// Note that the EOLs in this content are not normalized as the specification requires,
666+
// but because numbers cannot contain spaces in any case, this is not a problem.
667+
// Normalization of spaces in entity references and checking that they correspond to the
668+
// [`Name`] production are the responsibility of `resolve_entity`.
669+
//
670+
// [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
671+
let pat = &input[start..end];
672+
// 1. For a character reference, append the referenced character
673+
// to the normalized value.
674+
if pat.starts_with('#') {
675+
let entity = &pat[1..]; // starts after the #
676+
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
677+
normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
678+
} else
679+
// 2. For an entity reference, recursively apply this algorithm
680+
// to the replacement text of the entity.
681+
if let Some(value) = resolve_entity(pat) {
682+
normalize_attribute_steps(
683+
normalized,
684+
&mut value.as_bytes().iter(),
685+
value,
686+
0,
687+
depth.saturating_sub(1),
688+
resolve_entity,
689+
)?;
690+
} else {
691+
return Err(EscapeError::UnrecognizedEntity(start..end, pat.to_string()));
692+
}
693+
Ok(end + 1) // +1 - skip `;`
694+
}
695+
// 3. For a white space character (#x20, #xD, #xA, #x9), append
696+
// a space character (#x20) to the normalized value.
697+
// Space character has no special meaning, so it is handled on step 4
698+
b'\t' => {
699+
normalized.push(' ');
700+
Ok(index + 1) // +1 - skip \t
701+
}
702+
_ => {
703+
let pos = normalize_xml_eol_step(normalized, input.as_bytes(), index, ' ');
704+
// We should advance iterator because we may skip several characters
705+
for _ in 0..pos - index - 1 {
706+
iter.next();
707+
}
708+
Ok(pos)
709+
}
710+
}
711+
}
712+
713+
////////////////////////////////////////////////////////////////////////////////////////////////////
714+
492715
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
493716
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
494717
///
@@ -2154,4 +2377,119 @@ mod normalization {
21542377
}
21552378
}
21562379
}
2380+
2381+
mod attribute {
2382+
use super::*;
2383+
use pretty_assertions::assert_eq;
2384+
2385+
#[test]
2386+
fn empty() {
2387+
assert_eq!(
2388+
normalize_attribute_value("", 5, |_| { None }),
2389+
Ok("".into())
2390+
);
2391+
}
2392+
2393+
#[test]
2394+
fn only_spaces() {
2395+
assert_eq!(
2396+
normalize_attribute_value(" ", 5, |_| { None }),
2397+
Ok(" ".into())
2398+
);
2399+
assert_eq!(
2400+
normalize_attribute_value("\t\t\t", 5, |_| { None }),
2401+
Ok("   ".into())
2402+
);
2403+
assert_eq!(
2404+
normalize_attribute_value("\r\r\r", 5, |_| { None }),
2405+
Ok("   ".into())
2406+
);
2407+
assert_eq!(
2408+
normalize_attribute_value("\n\n\n", 5, |_| { None }),
2409+
Ok("   ".into())
2410+
);
2411+
}
2412+
2413+
#[test]
2414+
fn already_normalized() {
2415+
assert_eq!(
2416+
normalize_attribute_value("already normalized", 5, |_| { None }),
2417+
Ok("already normalized".into())
2418+
);
2419+
}
2420+
2421+
#[test]
2422+
fn characters() {
2423+
assert_eq!(
2424+
normalize_attribute_value("string with &#32; character", 5, |_| { None }),
2425+
Ok("string with   character".into())
2426+
);
2427+
assert_eq!(
2428+
normalize_attribute_value("string with &#x20; character", 5, |_| { None }),
2429+
Ok("string with   character".into())
2430+
);
2431+
}
2432+
2433+
#[test]
2434+
fn entities() {
2435+
assert_eq!(
2436+
normalize_attribute_value("string with &entity; reference", 5, |_| {
2437+
Some("replacement")
2438+
}),
2439+
Ok("string with replacement reference".into())
2440+
);
2441+
assert_eq!(
2442+
normalize_attribute_value("string with &entity-1; reference", 5, |entity| {
2443+
match entity {
2444+
"entity-1" => Some("recursive &entity-2;"),
2445+
"entity-2" => Some("entity&#32;2"),
2446+
_ => None,
2447+
}
2448+
}),
2449+
Ok("string with recursive entity 2 reference".into())
2450+
);
2451+
}
2452+
2453+
#[test]
2454+
fn unclosed_entity() {
2455+
assert_eq!(
2456+
normalize_attribute_value("string with unclosed &entity reference", 5, |_| {
2457+
// The `&` is at byte offset 21; the input is 38 bytes long
2458+
Some("replacement")
2459+
}),
2460+
Err(EscapeError::UnterminatedEntity(21..38))
2461+
);
2462+
assert_eq!(
2463+
normalize_attribute_value(
2464+
"string with unclosed &#32 (character) reference",
2465+
// The `&` is at byte offset 21; the input is 47 bytes long
2466+
5,
2467+
|_| { None }
2468+
),
2469+
Err(EscapeError::UnterminatedEntity(21..47))
2470+
);
2471+
}
2472+
2473+
#[test]
2474+
fn unknown_entity() {
2475+
assert_eq!(
2476+
normalize_attribute_value("string with unknown &entity; reference", 5, |_| {
2477+
// The entity name `entity` occupies bytes 21..27 (the `&` is at offset 20)
2478+
None
2479+
}),
2480+
Err(EscapeError::UnrecognizedEntity(
2481+
21..27,
2482+
"entity".to_string(),
2483+
))
2484+
);
2485+
}
2486+
2487+
#[test]
2488+
fn recursive_entity() {
2489+
assert_eq!(
2490+
normalize_attribute_value("&entity; reference", 5, |_| Some("recursive &entity;")),
2491+
Err(EscapeError::TooManyNestedEntities),
2492+
);
2493+
}
2494+
}
21572495
}

0 commit comments

Comments
 (0)