@@ -4,6 +4,7 @@ use memchr::{memchr, memchr2_iter, memchr3};
44use std:: borrow:: Cow ;
55use std:: num:: ParseIntError ;
66use std:: ops:: Range ;
7+ use std:: slice:: Iter ;
78
89/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
910#[ derive( Clone , Debug , PartialEq ) ]
@@ -50,6 +51,12 @@ pub enum EscapeError {
5051 /// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
5152 /// was unsuccessful, not all characters are decimal or hexadecimal numbers.
5253 InvalidCharRef ( ParseCharRefError ) ,
54+ /// Expanded more than maximum possible entities during attribute normalization.
55+ ///
56+ /// Attribute normalization includes expanding of general entities (`&entity;`)
57+ /// which replacement text also could contain entities, which is also must be expanded.
58+ /// If more than 128 entities would be expanded, this error is returned.
59+ TooManyNestedEntities ,
5360}
5461
5562impl std:: fmt:: Display for EscapeError {
@@ -66,6 +73,9 @@ impl std::fmt::Display for EscapeError {
6673 Self :: InvalidCharRef ( e) => {
6774 write ! ( f, "invalid character reference: {}" , e)
6875 }
76+ Self :: TooManyNestedEntities => {
77+ f. write_str ( "too many nested entities in an attribute value" )
78+ }
6979 }
7080 }
7181}
@@ -489,6 +499,219 @@ fn normalize_html_eol_step(normalized: &mut String, input: &[u8], index: usize,
489499
490500////////////////////////////////////////////////////////////////////////////////////////////////////
491501
/// Reports whether the byte `b` marks a place where attribute value normalization
/// may have to rewrite the input.
///
/// The flagged bytes are:
///
/// - `\t`, `\r` and `\n` -- white space that is replaced during normalization;
/// - `&` -- start of a character or entity reference;
/// - `0xC2` and `0xE2` -- first bytes of the UTF-8 encoded end-of-line characters
///   listed below.
///
/// The following sequences are the end-of-lines that have to be normalized:
///
/// |UTF-8   |String|
/// |--------|------|
/// |0d 0a   |\r\n  |
/// |0d c2 85|\r\x85|
/// |0d      |\r    |
/// |c2 85   |\x85  |
/// |e2 80 a8|\x2028|
///
/// Only the first byte is inspected here, so every character whose encoding
/// starts with `0xC2` or `0xE2` is reported, not just the two end-of-line ones.
///
/// The byte is taken by reference so that the function can be passed directly
/// to `Iterator::position` of a byte slice iterator.
const fn is_normalization_char(b: &u8) -> bool {
    match *b {
        // White space and the start of a reference
        b'\t' | b'\r' | b'\n' | b'&' => true,
        // First bytes of the multi-byte end-of-line characters
        0xC2 | 0xE2 => true,
        _ => false,
    }
}
515+
516+ /// Returns the attribute value normalized as per [the XML specification],
517+ /// using a custom entity resolver.
518+ ///
519+ /// Do not use this method with HTML attributes.
520+ ///
521+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
522+ /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
523+ /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
524+ /// take precedence.
525+ ///
526+ /// This will allocate unless the raw attribute value does not require normalization.
527+ ///
528+ /// # Parameters
529+ ///
530+ /// - `value`: unnormalized attribute value
531+ /// - `depth`: maximum number of nested entities that can be expanded. If expansion
532+ /// chain will be more that this value, the function will return [`EscapeError::TooManyNestedEntities`]
533+ /// - `resolve_entity`: a function to resolve entity. This function could be called
534+ /// multiple times on the same input and can return different values in each case
535+ /// for the same input, although it is not recommended
536+ ///
537+ /// # Lifetimes
538+ ///
539+ /// - `'input`: lifetime of the unnormalized attribute. If normalization is not requred,
540+ /// the input returned unchanged with the same lifetime
541+ /// - `'entity`: lifetime of all entities that is returned by the entity resolution routine
542+ ///
543+ /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
544+ pub ( crate ) fn normalize_attribute_value < ' input , ' entity , F > (
545+ value : & ' input str ,
546+ depth : usize ,
547+ mut resolve_entity : F ,
548+ ) -> Result < Cow < ' input , str > , EscapeError >
549+ where
550+ // the lifetime of the output comes from a capture or is `'static`
551+ F : FnMut ( & str ) -> Option < & ' entity str > ,
552+ {
553+ let mut iter = value. as_bytes ( ) . iter ( ) ;
554+
555+ // If we found the charater that requires normalization, create a normalized
556+ // version of the attribute, otherwise return the value unchanged
557+ if let Some ( i) = iter. position ( is_normalization_char) {
558+ let mut normalized = String :: with_capacity ( value. len ( ) ) ;
559+ let pos = normalize_attribute_step (
560+ & mut normalized,
561+ & mut iter,
562+ value,
563+ 0 ,
564+ i,
565+ depth,
566+ & mut resolve_entity,
567+ ) ?;
568+
569+ normalize_attribute_steps (
570+ & mut normalized,
571+ & mut iter,
572+ value,
573+ pos,
574+ depth,
575+ & mut resolve_entity,
576+ ) ?;
577+ return Ok ( normalized. into ( ) ) ;
578+ }
579+ Ok ( Cow :: Borrowed ( value) )
580+ }
581+
582+ fn normalize_attribute_steps < ' entity , F > (
583+ normalized : & mut String ,
584+ iter : & mut Iter < u8 > ,
585+ input : & str ,
586+ mut pos : usize ,
587+ depth : usize ,
588+ resolve_entity : & mut F ,
589+ ) -> Result < ( ) , EscapeError >
590+ where
591+ // the lifetime of the output comes from a capture or is `'static`
592+ F : FnMut ( & str ) -> Option < & ' entity str > ,
593+ {
594+ while let Some ( i) = iter. position ( is_normalization_char) {
595+ pos =
596+ normalize_attribute_step ( normalized, iter, input, pos, pos + i, depth, resolve_entity) ?;
597+ }
598+ if let Some ( rest) = input. get ( pos..) {
599+ normalized. push_str ( rest) ;
600+ }
601+ Ok ( ( ) )
602+ }
603+
604+ /// Performs one step of the [normalization algorithm] (but with recursive part):
605+ ///
606+ /// 1. For a character reference, append the referenced character
607+ /// to the normalized value.
608+ /// 2. For an entity reference, recursively apply this algorithm
609+ /// to the replacement text of the entity.
610+ /// 3. For a white space character (#x20, #xD, #xA, #x9), append
611+ /// a space character (#x20) to the normalized value.
612+ /// 4. For another character, append the character to the normalized value.
613+ ///
614+ /// Because [according to the specification], XML parser should parse line-of-end
615+ /// normalized input, but quick-xml does not do that, this function also performs
616+ /// normalization of EOL characters. That should be done before expanding entities
617+ /// and character references, so cannot be processed later.
618+ ///
619+ /// This function could be used also just to normalize line ends if the iterator
620+ /// won't be stop on `&` characters.
621+ ///
622+ /// # Parameters
623+ ///
624+ /// - `normalized`: Output of the algorithm. Normalized value will be placed here
625+ /// - `iter`: Iterator over bytes of `input`
626+ /// - `input`: Original non-normalized value
627+ /// - `last_pos`: Index of the last byte in `input` that was processed
628+ /// - `index`: Index of the byte in `input` that should be processed now
629+ /// - `seen_cr`: `\r\n` and `\r\x85` sequences should be normalized into one space
630+ /// so this parameter tracks if we seen the `\r` before processing the current byte
631+ /// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
632+ /// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
633+ ///
634+ /// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
635+ /// [according to the specification]: https://www.w3.org/TR/xml11/#sec-line-ends
636+ fn normalize_attribute_step < ' entity , F > (
637+ normalized : & mut String ,
638+ iter : & mut Iter < u8 > ,
639+ input : & str ,
640+ last_pos : usize ,
641+ index : usize ,
642+ depth : usize ,
643+ resolve_entity : & mut F ,
644+ ) -> Result < usize , EscapeError >
645+ where
646+ // the lifetime of the output comes from a capture or is `'static`
647+ F : FnMut ( & str ) -> Option < & ' entity str > ,
648+ {
649+ if depth == 0 {
650+ return Err ( EscapeError :: TooManyNestedEntities ) ;
651+ }
652+ // 4. For another character, append the character to the normalized value.
653+ normalized. push_str ( & input[ last_pos..index] ) ;
654+
655+ match input. as_bytes ( ) [ index] {
656+ b'&' => {
657+ let start = index + 1 ; // +1 - skip `&`
658+ let end = start
659+ + match iter. position ( |& b| b == b';' ) {
660+ Some ( end) => end,
661+ None => return Err ( EscapeError :: UnterminatedEntity ( index..input. len ( ) ) ) ,
662+ } ;
663+
664+ // Content between & and ; - &pat;
665+ // Note, that this content have non-normalized EOLs as required by the specification,
666+ // but because numbers in any case cannot have spaces inside, this is not the problem.
667+ // Normalization of spaces in entity references and checking that they corresponds to
668+ // [`Name`] production on conscience `resolve_entity`.
669+ //
670+ // [`Name`]: https://www.w3.org/TR/xml11/#NT-Name
671+ let pat = & input[ start..end] ;
672+ // 1. For a character reference, append the referenced character
673+ // to the normalized value.
674+ if pat. starts_with ( '#' ) {
675+ let entity = & pat[ 1 ..] ; // starts after the #
676+ let codepoint = parse_number ( entity) . map_err ( EscapeError :: InvalidCharRef ) ?;
677+ normalized. push_str ( codepoint. encode_utf8 ( & mut [ 0u8 ; 4 ] ) ) ;
678+ } else
679+ // 2. For an entity reference, recursively apply this algorithm
680+ // to the replacement text of the entity.
681+ if let Some ( value) = resolve_entity ( pat) {
682+ normalize_attribute_steps (
683+ normalized,
684+ & mut value. as_bytes ( ) . iter ( ) ,
685+ value,
686+ 0 ,
687+ depth. saturating_sub ( 1 ) ,
688+ resolve_entity,
689+ ) ?;
690+ } else {
691+ return Err ( EscapeError :: UnrecognizedEntity ( start..end, pat. to_string ( ) ) ) ;
692+ }
693+ Ok ( end + 1 ) // +1 - skip `;`
694+ }
695+ // 3. For a white space character (#x20, #xD, #xA, #x9), append
696+ // a space character (#x20) to the normalized value.
697+ // Space character has no special meaning, so it is handled on step 4
698+ b'\t' => {
699+ normalized. push ( ' ' ) ;
700+ Ok ( index + 1 ) // +1 - skip \t
701+ }
702+ _ => {
703+ let pos = normalize_xml_eol_step ( normalized, input. as_bytes ( ) , index, ' ' ) ;
704+ // We should advance iterator because we may skip several characters
705+ for _ in 0 ..pos - index - 1 {
706+ iter. next ( ) ;
707+ }
708+ Ok ( pos)
709+ }
710+ }
711+ }
712+
713+ ////////////////////////////////////////////////////////////////////////////////////////////////////
714+
492715/// Resolves predefined XML entities or all HTML5 entities depending on the feature
493716/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
494717///
@@ -2154,4 +2377,119 @@ mod normalization {
21542377 }
21552378 }
21562379 }
2380+
2381+ mod attribute {
2382+ use super :: * ;
2383+ use pretty_assertions:: assert_eq;
2384+
2385+ #[ test]
2386+ fn empty ( ) {
2387+ assert_eq ! (
2388+ normalize_attribute_value( "" , 5 , |_| { None } ) ,
2389+ Ok ( "" . into( ) )
2390+ ) ;
2391+ }
2392+
2393+ #[ test]
2394+ fn only_spaces ( ) {
2395+ assert_eq ! (
2396+ normalize_attribute_value( " " , 5 , |_| { None } ) ,
2397+ Ok ( " " . into( ) )
2398+ ) ;
2399+ assert_eq ! (
2400+ normalize_attribute_value( "\t \t \t " , 5 , |_| { None } ) ,
2401+ Ok ( " " . into( ) )
2402+ ) ;
2403+ assert_eq ! (
2404+ normalize_attribute_value( "\r \r \r " , 5 , |_| { None } ) ,
2405+ Ok ( " " . into( ) )
2406+ ) ;
2407+ assert_eq ! (
2408+ normalize_attribute_value( "\n \n \n " , 5 , |_| { None } ) ,
2409+ Ok ( " " . into( ) )
2410+ ) ;
2411+ }
2412+
2413+ #[ test]
2414+ fn already_normalized ( ) {
2415+ assert_eq ! (
2416+ normalize_attribute_value( "already normalized" , 5 , |_| { None } ) ,
2417+ Ok ( "already normalized" . into( ) )
2418+ ) ;
2419+ }
2420+
2421+ #[ test]
2422+ fn characters ( ) {
2423+ assert_eq ! (
2424+ normalize_attribute_value( "string with   character" , 5 , |_| { None } ) ,
2425+ Ok ( "string with character" . into( ) )
2426+ ) ;
2427+ assert_eq ! (
2428+ normalize_attribute_value( "string with   character" , 5 , |_| { None } ) ,
2429+ Ok ( "string with character" . into( ) )
2430+ ) ;
2431+ }
2432+
2433+ #[ test]
2434+ fn entities ( ) {
2435+ assert_eq ! (
2436+ normalize_attribute_value( "string with &entity; reference" , 5 , |_| {
2437+ Some ( "replacement" )
2438+ } ) ,
2439+ Ok ( "string with replacement reference" . into( ) )
2440+ ) ;
2441+ assert_eq ! (
2442+ normalize_attribute_value( "string with &entity-1; reference" , 5 , |entity| {
2443+ match entity {
2444+ "entity-1" => Some ( "recursive &entity-2;" ) ,
2445+ "entity-2" => Some ( "entity 2" ) ,
2446+ _ => None ,
2447+ }
2448+ } ) ,
2449+ Ok ( "string with recursive entity 2 reference" . into( ) )
2450+ ) ;
2451+ }
2452+
2453+ #[ test]
2454+ fn unclosed_entity ( ) {
2455+ assert_eq ! (
2456+ normalize_attribute_value( "string with unclosed &entity reference" , 5 , |_| {
2457+ // 0 ^ = 21 ^ = 38
2458+ Some ( "replacement" )
2459+ } ) ,
2460+ Err ( EscapeError :: UnterminatedEntity ( 21 ..38 ) )
2461+ ) ;
2462+ assert_eq ! (
2463+ normalize_attribute_value(
2464+ "string with unclosed   (character) reference" ,
2465+ // ^ = 21 ^ = 47
2466+ 5 ,
2467+ |_| { None }
2468+ ) ,
2469+ Err ( EscapeError :: UnterminatedEntity ( 21 ..47 ) )
2470+ ) ;
2471+ }
2472+
2473+ #[ test]
2474+ fn unknown_entity ( ) {
2475+ assert_eq ! (
2476+ normalize_attribute_value( "string with unknown &entity; reference" , 5 , |_| {
2477+ // 0 ^ ^ = 21..27
2478+ None
2479+ } ) ,
2480+ Err ( EscapeError :: UnrecognizedEntity (
2481+ 21 ..27 ,
2482+ "entity" . to_string( ) ,
2483+ ) )
2484+ ) ;
2485+ }
2486+
2487+ #[ test]
2488+ fn recursive_entity ( ) {
2489+ assert_eq ! (
2490+ normalize_attribute_value( "&entity; reference" , 5 , |_| Some ( "recursive &entity;" ) ) ,
2491+ Err ( EscapeError :: TooManyNestedEntities ) ,
2492+ ) ;
2493+ }
2494+ }
21572495}
0 commit comments