diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
index 56ea0f705c2b8..1419e623a8246 100644
--- a/src/wp-includes/html-api/class-wp-html-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -658,6 +658,57 @@ public function get_unsupported_exception() {
 		return $this->unsupported_exception;
 	}
 
+	/**
+	 * Progresses through a document, pausing on tags matching the provided CSS selector string.
+	 *
+	 * The most recently parsed selector is memoized in static variables, so calling this
+	 * method repeatedly with the same selector string parses the selector only once.
+	 * An invalid or unsupported selector is reported with `_doing_it_wrong()` and
+	 * the method returns false without advancing the processor.
+	 *
+	 * Example:
+	 *
+	 *     $processor = WP_HTML_Processor::create_fragment(
+	 *         '<title>Example</title><meta property="og:type" content="website"><meta property="og:description" content="An example.">'
+	 *     );
+	 *     while ( $processor->select( 'meta[property^="og:" i]' ) ) {
+	 *         // Loop is entered twice.
+	 *         var_dump(
+	 *             $processor->get_tag(),                      // string(4) "META"
+	 *             $processor->get_attribute( 'property' ),    // string(7) "og:type" / string(14) "og:description"
+	 *             $processor->get_attribute( 'content' ),     // string(7) "website" / string(11) "An example."
+	 *         );
+	 *     }
+	 *
+	 * @since {WP_VERSION}
+	 *
+	 * @param string $selector_string Selector string.
+	 * @return bool Whether a selection was found.
+	 */
+	public function select( $selector_string ): bool {
+		static $previous_selector_string = null;
+		static $previous_selector        = null;
+
+		// Only parse again when the selector string differs from the previous call.
+		if ( $selector_string !== $previous_selector_string ) {
+			$previous_selector        = WP_CSS_Complex_Selector_List::from_selectors( $selector_string );
+			$previous_selector_string = $selector_string;
+		}
+		$selector = $previous_selector;
+
+		if ( null === $selector ) {
+			_doing_it_wrong(
+				__METHOD__,
+				sprintf(
+					// translators: %s: The CSS selector string that could not be parsed.
+					__( 'Received unsupported or invalid selector "%s".' ),
+					$selector_string
+				),
+				'{WP_VERSION}'
+			);
+			return false;
+		}
+
+		// Advance tag by tag until one satisfies the selector or the document ends.
+		while ( $this->next_tag() ) {
+			if ( $selector->matches( $this ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
 	/**
 	 * Finds the next tag matching the $query.
*
diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
index 501a623afb10b..9e0b22eb15bc3 100644
--- a/src/wp-includes/html-api/class-wp-html-tag-processor.php
+++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -537,6 +537,10 @@ class WP_HTML_Tag_Processor {
 	 */
 	protected $compat_mode = self::NO_QUIRKS_MODE;
 
+	/**
+	 * Indicates whether the processor's compatibility mode is quirks mode.
+	 *
+	 * @since {WP_VERSION}
+	 *
+	 * @return bool True when `$compat_mode` is `self::QUIRKS_MODE`.
+	 */
+	public function is_quirks_mode(): bool {
+		return self::QUIRKS_MODE === $this->compat_mode;
+	}
+
 	/**
 	 * Indicates whether the parser is inside foreign content,
 	 * e.g. inside an SVG or MathML element.
@@ -864,6 +868,57 @@ public function change_parsing_namespace( string $new_namespace ): bool {
 		return true;
 	}
 
+	/**
+	 * Progresses through a document, pausing on tags matching the provided CSS selector string.
+	 *
+	 * The most recently parsed selector is memoized in static variables, so calling this
+	 * method repeatedly with the same selector string parses the selector only once.
+	 * An invalid or unsupported selector is reported with `_doing_it_wrong()` and
+	 * the method returns false without advancing the processor.
+	 *
+	 * Example:
+	 *
+	 *     $processor = new WP_HTML_Tag_Processor(
+	 *         '<title>Example</title><meta property="og:type" content="website"><meta property="og:description" content="An example.">'
+	 *     );
+	 *     while ( $processor->select( 'meta[property^="og:" i]' ) ) {
+	 *         // Loop is entered twice.
+	 *         var_dump(
+	 *             $processor->get_tag(),                      // string(4) "META"
+	 *             $processor->get_attribute( 'property' ),    // string(7) "og:type" / string(14) "og:description"
+	 *             $processor->get_attribute( 'content' ),     // string(7) "website" / string(11) "An example."
+	 *         );
+	 *     }
+	 *
+	 * @since {WP_VERSION}
+	 *
+	 * @param string $selector_string Selector string.
+	 * @return bool Whether a selection was found.
+	 */
+	public function select( $selector_string ): bool {
+		static $previous_selector_string = null;
+		static $previous_selector        = null;
+
+		// Only parse again when the selector string differs from the previous call.
+		if ( $selector_string !== $previous_selector_string ) {
+			$previous_selector        = WP_CSS_Compound_Selector_List::from_selectors( $selector_string );
+			$previous_selector_string = $selector_string;
+		}
+		$selector = $previous_selector;
+
+		if ( null === $selector ) {
+			_doing_it_wrong(
+				__METHOD__,
+				sprintf(
+					// translators: %s: The CSS selector string that could not be parsed.
+					__( 'Received unsupported or invalid selector "%s".' ),
+					$selector_string
+				),
+				'{WP_VERSION}'
+			);
+			return false;
+		}
+
+		// Advance tag by tag until one satisfies the selector or the document ends.
+		while ( $this->next_tag() ) {
+			if ( $selector->matches( $this ) ) {
+				return true;
+			}
+		}
+
+		return false;
+	}
+
 	/**
 	 * Finds the next tag matching the $query.
 	 *
diff --git a/src/wp-includes/html-api/css/class-wp-css-attribute-selector.php b/src/wp-includes/html-api/css/class-wp-css-attribute-selector.php
new file mode 100644
index 0000000000000..134e68104811f
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-attribute-selector.php
@@ -0,0 +1,450 @@
+ true,
+		'accept-charset' => true,
+		'align' => true,
+		'alink' => true,
+		'axis' => true,
+		'bgcolor' => true,
+		'charset' => true,
+		'checked' => true,
+		'clear' => true,
+		'codetype' => true,
+		'color' => true,
+		'compact' => true,
+		'declare' => true,
+		'defer' => true,
+		'dir' => true,
+		'direction' => true,
+		'disabled' => true,
+		'enctype' => true,
+		'face' => true,
+		'frame' => true,
+		'hreflang' => true,
+		'http-equiv' => true,
+		'lang' => true,
+		'language' => true,
+		'link' => true,
+		'media' => true,
+		'method' => true,
+		'multiple' => true,
+		'nohref' => true,
+		'noresize' => true,
+		'noshade' => true,
+		'nowrap' => true,
+		'readonly' => true,
+		'rel' => true,
+		'rev' => true,
+		'rules' => true,
+		'scope' => true,
+		'scrolling' => true,
+		'selected' => true,
+		'shape' => true,
+		'target' => true,
+		'text' => true,
+		'type' => true,
+		'valign' => true,
+		'valuetype' => true,
+		'vlink' => true,
+	);
+
+	/**
+	 * The name of the attribute to match.
+	 *
+	 * @var string
+	 */
+	public $name;
+
+	/**
+	 * The attribute matcher.
+	 *
+	 * Null when the selector only tests for the presence of the attribute (`[attr]`).
+	 *
+	 * Allowed string values are the class constants:
+	 * - {@see WP_CSS_Attribute_Selector::MATCH_EXACT}
+	 * - {@see WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT}
+	 * - {@see WP_CSS_Attribute_Selector::MATCH_EXACT_OR_HYPHEN_SUFFIXED}
+	 * - {@see WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY}
+	 * - {@see WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY}
+	 * - {@see WP_CSS_Attribute_Selector::MATCH_CONTAINS}
+	 *
+	 * @var string|null
+	 */
+	public $matcher;
+
+	/**
+	 * The attribute value to match.
+	 *
+	 * Null when the selector only tests for the presence of the attribute (`[attr]`).
+	 *
+	 * @var string|null
+	 */
+	public $value;
+
+	/**
+	 * The attribute modifier.
+	 *
+	 * Null when the selector text carries no explicit `i` or `s` modifier.
+	 *
+	 * Allowed string values are the class constants:
+	 * - {@see WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE}
+	 * - {@see WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE}
+	 *
+	 * @var string|null
+	 */
+	public $modifier;
+
+	/**
+	 * Constructor.
+	 *
+	 * Private: instances are created by {@see WP_CSS_Attribute_Selector::parse()}.
+	 *
+	 * @param string      $name     The attribute name.
+	 * @param string|null $matcher  The attribute matcher.
+	 *                              Must be one of the class MATCH_* constants or null.
+	 * @param string|null $value    The attribute value to match.
+	 * @param string|null $modifier The attribute case modifier.
+	 *                              Must be one of the class MODIFIER_* constants or null.
+	 */
+	private function __construct( string $name, ?string $matcher = null, ?string $value = null, ?string $modifier = null ) {
+		$this->name     = $name;
+		$this->matcher  = $matcher;
+		$this->value    = $value;
+		$this->modifier = $modifier;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		// A missing attribute can never match, whatever the matcher.
+		$attr_value = $processor->get_attribute( $this->name );
+		if ( null === $attr_value ) {
+			return false;
+		}
+
+		// Presence-only selector (`[attr]`): the attribute exists, so it matches.
+		if ( null === $this->value ) {
+			return true;
+		}
+
+		/*
+		 * The substring matchers match nothing when the value is empty:
+		 *
+		 * > If "val" is the empty string then the selector does not represent anything.
+		 *
+		 * https://www.w3.org/TR/selectors-4/#attribute-substrings
+		 */
+		if (
+			'' === $this->value &&
+			(
+				self::MATCH_PREFIXED_BY === $this->matcher ||
+				self::MATCH_SUFFIXED_BY === $this->matcher ||
+				self::MATCH_CONTAINS === $this->matcher
+			)
+		) {
+			return false;
+		}
+
+		// A `true` attribute value is compared as the empty string.
+		if ( true === $attr_value ) {
+			$attr_value = '';
+		}
+
+		/*
+		 * Without an explicit modifier, HTML defines some attributes' values
+		 * as ASCII case-insensitive on HTML elements. An explicit `s`
+		 * modifier forces case-sensitive matching even for those.
+		 */
+		$case_insensitive = self::MODIFIER_CASE_INSENSITIVE === $this->modifier || (
+			null === $this->modifier &&
+			'html' === $processor->get_namespace() &&
+			isset( self::HTML_CASE_INSENSITIVE_ATTRIBUTE_VALUES[ strtolower( $this->name ) ] )
+		);
+
+		switch ( $this->matcher ) {
+			case self::MATCH_EXACT:
+				return $case_insensitive
+					? 0 === strcasecmp( $attr_value, $this->value )
+					: $attr_value === $this->value;
+
+			case self::MATCH_ONE_OF_EXACT:
+				foreach ( $this->whitespace_delimited_list( $attr_value ) as $val ) {
+					if (
+						$case_insensitive
+							? 0 === strcasecmp( $val, $this->value )
+							: $val === $this->value
+					) {
+						return true;
+					}
+				}
+				return false;
+
+			case self::MATCH_EXACT_OR_HYPHEN_SUFFIXED:
+				$exact_length   = strlen( $this->value );
+				$matches_prefix = substr_compare( $attr_value, $this->value, 0, $exact_length, $case_insensitive );
+				// Either the whole value matched, or the matched prefix is followed by `-`.
+				return (
+					0 === $matches_prefix &&
+					( strlen( $attr_value ) === $exact_length || '-' === $attr_value[ $exact_length ] )
+				);
+
+			case self::MATCH_PREFIXED_BY:
+				return 0 === substr_compare( $attr_value, $this->value, 0, strlen( $this->value ), $case_insensitive );
+
+			case self::MATCH_SUFFIXED_BY:
+				return 0 === substr_compare( $attr_value, $this->value, -strlen( $this->value ), null, $case_insensitive );
+
+			case self::MATCH_CONTAINS:
+				return false !== (
+					$case_insensitive
+						? stripos( $attr_value, $this->value )
+						: strpos( $attr_value, $this->value )
+				);
+		}
+
+		/*
+		 * An unrecognized matcher matches nothing. Without this fallback the
+		 * method would end without a return value and violate its `bool`
+		 * return type.
+		 */
+		return false;
+	}
+
+	/**
+	 * Splits a string into a list of whitespace delimited values.
+	 *
+	 * This is useful for the {@see WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT} matcher.
+	 * Yielded values are never empty and never contain whitespace characters.
+	 *
+	 * @param string $input The string to split.
+	 *
+	 * @return Generator Yields each whitespace-delimited value from the input string.
+	 */
+	private function whitespace_delimited_list( string $input ): Generator {
+		$end = strlen( $input );
+
+		// Start by skipping whitespace.
+		$offset = strspn( $input, self::WHITESPACE_CHARACTERS );
+
+		while ( $offset < $end ) {
+			// Find the byte length until the next boundary.
+			$length = strcspn( $input, self::WHITESPACE_CHARACTERS, $offset );
+			$value  = substr( $input, $offset, $length );
+
+			// Move past trailing whitespace.
+			$offset += $length + strspn( $input, self::WHITESPACE_CHARACTERS, $offset + $length );
+
+			yield $value;
+		}
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * The end of input acts like a closing `]`: tokenization auto-closes
+	 * unterminated simple blocks (and unterminated strings) at EOF, so
+	 * `[att=val` is the same selector as `[att=val]`. Truncation inside the
+	 * selector grammar itself (e.g. `[` or `[att=`) is still invalid.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#consume-simple-block
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		// Need at least 2 bytes `[x`; the closing `]` may be supplied by the end of input.
+		if ( $offset + 1 >= strlen( $input ) ) {
+			return null;
+		}
+
+		// Work on a copy; $offset is only written on the successful return paths.
+		$updated_offset = $offset;
+
+		if ( '[' !== $input[ $updated_offset ] ) {
+			return null;
+		}
+		++$updated_offset;
+
+		self::parse_whitespace( $input, $updated_offset );
+		$attr_name = self::parse_ident( $input, $updated_offset );
+		if ( null === $attr_name ) {
+			return null;
+		}
+		self::parse_whitespace( $input, $updated_offset );
+
+		// The end of input auto-closes the attribute selector.
+		if ( $updated_offset >= strlen( $input ) ) {
+			$offset = $updated_offset;
+			return new WP_CSS_Attribute_Selector( $attr_name );
+		}
+
+		// `[attr]`: presence-only selector, consume the closing bracket.
+		if ( ']' === $input[ $updated_offset ] ) {
+			$offset = $updated_offset + 1;
+			return new WP_CSS_Attribute_Selector( $attr_name );
+		}
+
+		// A matcher is either a bare `=` or one of `~|^$*` immediately followed by `=`.
+		if ( '=' === $input[ $updated_offset ] ) {
+			++$updated_offset;
+			$attr_matcher = WP_CSS_Attribute_Selector::MATCH_EXACT;
+		} elseif ( $updated_offset + 1 < strlen( $input ) && '=' === $input[ $updated_offset + 1 ] ) {
+			switch ( $input[ $updated_offset ] ) {
+				case '~':
+					$attr_matcher    = WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT;
+					$updated_offset += 2;
+					break;
+				case '|':
+					$attr_matcher    = WP_CSS_Attribute_Selector::MATCH_EXACT_OR_HYPHEN_SUFFIXED;
+					$updated_offset += 2;
+					break;
+				case '^':
+					$attr_matcher    = WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY;
+					$updated_offset += 2;
+					break;
+				case '$':
+					$attr_matcher    = WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY;
+					$updated_offset += 2;
+					break;
+				case '*':
+					$attr_matcher    = WP_CSS_Attribute_Selector::MATCH_CONTAINS;
+					$updated_offset += 2;
+					break;
+				default:
+					return null;
+			}
+		} else {
+			return null;
+		}
+
+		// The value is a string token or, failing that, an ident token.
+		self::parse_whitespace( $input, $updated_offset );
+		$attr_val =
+			self::parse_string( $input, $updated_offset ) ??
+			self::parse_ident( $input, $updated_offset );
+
+		if ( null === $attr_val ) {
+			return null;
+		}
+
+		self::parse_whitespace( $input, $updated_offset );
+
+		// Optional single-letter case modifier: `i` (insensitive) or `s` (sensitive).
+		$attr_modifier = null;
+		if ( $updated_offset < strlen( $input ) ) {
+			switch ( $input[ $updated_offset ] ) {
+				case 'i':
+				case 'I':
+					$attr_modifier = WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE;
+					++$updated_offset;
+					break;
+
+				case 's':
+				case 'S':
+					$attr_modifier = WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE;
+					++$updated_offset;
+					break;
+			}
+
+			if ( null !== $attr_modifier ) {
+				self::parse_whitespace( $input, $updated_offset );
+			}
+		}
+
+		// The end of input auto-closes the attribute selector.
+		if ( $updated_offset >= strlen( $input ) ) {
+			$offset = $updated_offset;
+			return new self( $attr_name, $attr_matcher, $attr_val, $attr_modifier );
+		}
+
+		if ( ']' === $input[ $updated_offset ] ) {
+			$offset = $updated_offset + 1;
+			return new self( $attr_name, $attr_matcher, $attr_val, $attr_modifier );
+		}
+
+		// Anything other than `]` or the end of input here is a parse failure.
+		return null;
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-class-selector.php b/src/wp-includes/html-api/css/class-wp-css-class-selector.php
new file mode 100644
index 0000000000000..121b3abf10f96
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-class-selector.php
@@ -0,0 +1,71 @@
+class_name = $class_name;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * Delegates to the processor's `has_class()` and casts the result to bool.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		return (bool) $processor->has_class( $this->class_name );
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		// Need a `.` followed by at least one more byte for the class name.
+		if ( $offset + 1 >= strlen( $input ) || '.' !== $input[ $offset ] ) {
+			return null;
+		}
+
+		$updated_offset = $offset + 1;
+		$result         = self::parse_ident( $input, $updated_offset );
+
+		if ( null === $result ) {
+			return null;
+		}
+
+		$offset = $updated_offset;
+		return new self( $result );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-complex-selector-list.php b/src/wp-includes/html-api/css/class-wp-css-complex-selector-list.php
new file mode 100644
index 0000000000000..da5e17011e0d8
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-complex-selector-list.php
@@ -0,0 +1,81 @@
+ in the grammar.
+ * See {@see WP_CSS_Compound_Selector_List} for more details on the grammar.
+ *
+ * This class supports the same selector syntax as {@see WP_CSS_Compound_Selector_List} as well as
+ * the following combinators:
+ * - Descendant (`ancestor descendant`)
+ * - Child (`parent > child`)
+ *
+ * Combinators may only be used with type selectors in the non-final position, for example:
+ * - `div [type=input]` is valid because the `div` type selector appears in a non-final position.
+ * - `[disabled] option` is NOT valid, because the `[disabled]` attribute selector appears
+ *   in a non-final position.
+ *
+ * These combinators are not supported:
+ * - Next sibling (`former-sibling + next-sibling`)
+ * - Subsequent sibling (`former-sibling ~ subsequent-sibling`)
+ *
+ * @since {WP_VERSION}
+ *
+ * @access private
+ */
+class WP_CSS_Complex_Selector_List extends WP_CSS_Compound_Selector_List {
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		// Work on a copy so that $offset is only advanced when the whole list parses.
+		$updated_offset = $offset;
+
+		$selector = WP_CSS_Complex_Selector::parse( $input, $updated_offset );
+		if ( null === $selector ) {
+			return null;
+		}
+		self::parse_whitespace( $input, $updated_offset );
+
+		$selectors    = array( $selector );
+		$input_length = strlen( $input );
+		while ( $updated_offset < $input_length ) {
+			// Each loop should stop on a `,` selector list delimiter.
+			if ( ',' !== $input[ $updated_offset ] ) {
+				return null;
+			}
+			++$updated_offset;
+			self::parse_whitespace( $input, $updated_offset );
+
+			// A delimiter must be followed by another selector; a trailing `,` is invalid.
+			$selector = WP_CSS_Complex_Selector::parse( $input, $updated_offset );
+			if ( null === $selector ) {
+				return null;
+			}
+			$selectors[] = $selector;
+			self::parse_whitespace( $input, $updated_offset );
+		}
+
+		$offset = $updated_offset;
+		return new self( $selectors );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-complex-selector.php b/src/wp-includes/html-api/css/class-wp-css-complex-selector.php
new file mode 100644
index 0000000000000..fd05c29daba91
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-complex-selector.php
@@ -0,0 +1,260 @@
+';
+
+	/**
+	 * Descendant combinator.
+	 */
+	const COMBINATOR_DESCENDANT = ' ';
+
+	/**
+	 * Next sibling combinator.
+	 *
+	 * This combinator is not currently supported.
+	 */
+	const COMBINATOR_NEXT_SIBLING = '+';
+
+	/**
+	 * Subsequent sibling combinator.
+	 *
+	 * This combinator is not currently supported.
+	 */
+	const COMBINATOR_SUBSEQUENT_SIBLING = '~';
+
+	/**
+	 * The "self selector" is the last element in a complex selector, it corresponds to the
+	 * selected element.
+	 *
+	 * Example:
+	 *
+	 *                $self_selector
+	 *                  ┏━━━━┻━━━━┓
+	 *     .heading h1 > el.selected
+	 *
+	 * @readonly
+	 * @var WP_CSS_Compound_Selector
+	 */
+	public $self_selector;
+
+	/**
+	 * The "context selectors" are zero or more elements that provide additional constraints for
+	 * the "self selector."
+	 *
+	 * These selectors are represented as 2-tuples where the element at index 0 is the selector and
+	 * the element at index 1 is the combinator string constant from this class,
+	 * e.g.
`WP_CSS_Complex_Selector::COMBINATOR_CHILD`.
+	 *
+	 * In the example selector below, an element like `<strong class="selected">` matches iff:
+	 * - it is a child of an `H1` element
+	 * - that `H1` element is a descendant of a `SECTION` element.
+	 *
+	 * The `section` and `h1` parts of this selector and their combinators are the
+	 * "context selectors." Note that this terminology does not correspond to language in the
+	 * specification texts.
+	 *
+	 *     $context_selectors
+	 *        ┏━━━━━┻━━━━┓
+	 *        section h1 > strong.selected
+	 *
+	 * The example would have the following context selectors:
+	 *
+	 *     // Pseudo-code
+	 *     array(
+	 *       array( WP_CSS_Type_Selector( 'type'=>'h1' ), '>' ),
+	 *       array( WP_CSS_Type_Selector( 'type'=>'section' ), ' ' ),
+	 *     )
+	 *
+	 * Context selectors are ordered from right to left in the selector text. The selectors closest
+	 * to the target appear at the start of the `context_selectors` array.
+	 *
+	 * @readonly
+	 * @var array{WP_CSS_Type_Selector, string}[]|null
+	 */
+	public $context_selectors;
+
+	/**
+	 * Constructor.
+	 *
+	 * @param WP_CSS_Compound_Selector                   $self_selector     The selector in the final position.
+	 * @param array{WP_CSS_Type_Selector, string}[]|null $context_selectors The context selectors.
+	 */
+	private function __construct(
+		WP_CSS_Compound_Selector $self_selector,
+		?array $context_selectors
+	) {
+		$this->self_selector     = $self_selector;
+		$this->context_selectors = $context_selectors;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * The self selector is tested against the current tag; the context selectors are
+	 * then tested against the processor's breadcrumbs, nearest ancestor first.
+	 *
+	 * @param WP_HTML_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( $processor ): bool {
+		// First selector must match this location.
+		if ( ! $this->self_selector->matches( $processor ) ) {
+			return false;
+		}
+
+		if ( null === $this->context_selectors || array() === $this->context_selectors ) {
+			return true;
+		}
+
+		// Reverse the breadcrumbs and drop the first entry (the current element itself).
+		$breadcrumbs = array_slice( array_reverse( $processor->get_breadcrumbs() ), 1 );
+		return $this->explore_matches( $this->context_selectors, $breadcrumbs );
+	}
+
+	/**
+	 * Checks for matches by recursively comparing context selectors with breadcrumbs.
+	 *
+	 * Both arrays are consumed from index 0: each level of recursion handles the first
+	 * context selector and passes the remaining selectors and breadcrumbs along.
+	 *
+	 * @param array{WP_CSS_Type_Selector, string}[] $selectors   Selectors to match.
+	 * @param string[]                              $breadcrumbs Breadcrumbs.
+	 * @return bool True if a match is found, otherwise false.
+	 */
+	private function explore_matches( array $selectors, array $breadcrumbs ): bool {
+		// Every context selector has been satisfied.
+		if ( array() === $selectors ) {
+			return true;
+		}
+		// Selectors remain but there are no more ancestors to test them against.
+		if ( array() === $breadcrumbs ) {
+			return false;
+		}
+
+		$selector            = $selectors[0][0];
+		$combinator          = $selectors[0][1];
+		$remaining_selectors = array_slice( $selectors, 1 );
+
+		switch ( $combinator ) {
+			case self::COMBINATOR_CHILD:
+				// A child combinator can only be satisfied by the immediate parent.
+				if ( $selector->matches_tag( $breadcrumbs[0] ) ) {
+					return $this->explore_matches( $remaining_selectors, array_slice( $breadcrumbs, 1 ) );
+				}
+				return false;
+
+			case self::COMBINATOR_DESCENDANT:
+				// Find _all_ the breadcrumbs that match and recurse from each of them.
+				foreach ( $breadcrumbs as $i => $breadcrumb ) {
+					if (
+						$selector->matches_tag( $breadcrumb ) &&
+						$this->explore_matches( $remaining_selectors, array_slice( $breadcrumbs, $i + 1 ) )
+					) {
+						return true;
+					}
+				}
+				return false;
+
+			default:
+				_doing_it_wrong(
+					__METHOD__,
+					sprintf(
+						// translators: %s: A CSS selector combinator like ">" or "+".
+						__( 'Unsupported combinator "%s" found.' ),
+						$combinator
+					),
+					'{WP_VERSION}'
+				);
+				return false;
+		}
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		if ( $offset >= strlen( $input ) ) {
+			return null;
+		}
+
+		// Work on a copy; $offset is only written once the whole selector has parsed.
+		$updated_offset = $offset;
+		$self_selector  = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
+		if ( null === $self_selector ) {
+			return null;
+		}
+		/** @var array{WP_CSS_Type_Selector, string}[] */
+		$selectors = array();
+
+		$found_whitespace = self::parse_whitespace( $input, $updated_offset );
+		while ( $updated_offset < strlen( $input ) ) {
+			$combinator    = null;
+			$next_selector = null;
+
+			// Sibling (`+` and `~`) combinators are not supported at this time.
+			if (
+				WP_CSS_Complex_Selector::COMBINATOR_NEXT_SIBLING === $input[ $updated_offset ] ||
+				WP_CSS_Complex_Selector::COMBINATOR_SUBSEQUENT_SIBLING === $input[ $updated_offset ]
+			) {
+				return null;
+			} elseif (
+				WP_CSS_Complex_Selector::COMBINATOR_CHILD === $input[ $updated_offset ]
+			) {
+				$combinator = $input[ $updated_offset ];
+				++$updated_offset;
+				self::parse_whitespace( $input, $updated_offset );
+
+				// A combinator has been found, failure to find a selector here is a parse error.
+				$next_selector = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
+				if ( null === $next_selector ) {
+					return null;
+				}
+			} elseif ( $found_whitespace ) {
+				/*
+				 * Whitespace is ambiguous, it could be a descendant combinator or
+				 * insignificant whitespace.
+				 */
+				$next_selector = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
+				if ( null !== $next_selector ) {
+					$combinator = WP_CSS_Complex_Selector::COMBINATOR_DESCENDANT;
+				}
+			}
+
+			// No further compound selector: this complex selector is complete.
+			if ( null === $next_selector ) {
+				break;
+			}
+
+			// $self_selector will pass to a relative selector where only the type selector is allowed.
+			if ( null !== $self_selector->subclass_selectors || null === $self_selector->type_selector ) {
+				return null;
+			}
+
+			// The previous selector becomes context; the newly parsed one is the new self selector.
+			/** @var array{WP_CSS_Type_Selector, string} */
+			$selector_pair = array( $self_selector->type_selector, $combinator );
+			$selectors[]   = $selector_pair;
+			$self_selector = $next_selector;
+
+			$found_whitespace = self::parse_whitespace( $input, $updated_offset );
+		}
+		$offset = $updated_offset;
+
+		// Context selectors were collected left to right; reverse so the nearest comes first.
+		return new self( $self_selector, array_reverse( $selectors ) );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-compound-selector-list.php b/src/wp-includes/html-api/css/class-wp-css-compound-selector-list.php
new file mode 100644
index 0000000000000..4042b1bc94f3e
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-compound-selector-list.php
@@ -0,0 +1,191 @@
+ in the grammar. The supported grammar is:
+ *
+ *     <selector-list> = <complex-selector-list>
+ *     <complex-selector-list> = <complex-selector>#
+ *     <compound-selector-list> = <compound-selector>#
+ *     <complex-selector> = [ <type-selector> <combinator>? ]* <compound-selector>
+ *     <compound-selector> = [ <type-selector>? <subclass-selector>* ]!
+ *     <combinator> = '>' | [ '|' '|' ]
+ *     <type-selector> = <ident-token> | '*'
+ *     <subclass-selector> = <id-selector> | <class-selector> | <attribute-selector>
+ *     <id-selector> = <hash-token>
+ *     <class-selector> = '.' <ident-token>
+ *     <attribute-selector> = '[' <ident-token> ']' |
+ *                            '[' <ident-token> <attr-matcher> [ <string-token> | <ident-token> ] <attr-modifier>? ']'
+ *     <attr-matcher> = [ '~' | '|' | '^' | '$' | '*' ]? '='
+ *     <attr-modifier> = i | s
+ *
+ * @link https://www.w3.org/TR/selectors/#grammar Refer to the grammar for more details.
+ *
+ * This class of selectors does not support "complex" selectors. That is any selector with a
+ * combinator such as descendant (`.ancestor .descendant`) or child (`.parent > .child`).
+ * See {@see WP_CSS_Complex_Selector_List} for support of some combinators.
+ *
+ * Note that this grammar has been adapted and does not support the full CSS selector grammar.
+ * Supported selector syntax:
+ * - Type selectors (tag names, e.g. `div`)
+ * - Class selectors (e.g. `.class-name`)
+ * - ID selectors (e.g. `#unique-id`)
+ * - Attribute selectors (e.g. `[attribute-name]` or `[attribute-name="value"]`)
+ * - Comma-separated selector lists (e.g. `.selector-1, .selector-2`)
+ * - Compound selectors (e.g.
`div.class-name#id[attr]`)
+ *
+ * Unsupported selector syntax:
+ * - Pseudo-element selectors (`::before`)
+ * - Pseudo-class selectors (`:hover` or `:nth-child(2)`)
+ * - Namespace prefixes (`svg|title` or `[xlink|href]`)
+ * - Combinators are not supported by this class (descendant, child, next sibling,
+ *   subsequent sibling). See {@see WP_CSS_Complex_Selector_List} for combinator support.
+ *
+ * Future ideas:
+ * - Namespace type selectors could be implemented with select namespaces in order to
+ *   select elements from a namespace, for example:
+ *   - `svg|*` to select all SVG elements
+ *   - `html|title` to select only HTML TITLE elements.
+ *
+ * @since {WP_VERSION}
+ *
+ * @access private
+ *
+ * @link https://www.w3.org/TR/css-syntax-3/
+ * @link https://www.w3.org/TR/selectors/
+ * @link https://www.w3.org/TR/selectors-api2/
+ * @link https://www.w3.org/TR/selectors-4/
+ */
+class WP_CSS_Compound_Selector_List extends WP_CSS_Selector_Parser_Matcher {
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * Only tag tokens can match. The list matches when any one of its selectors matches.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( $processor ): bool {
+		if ( $processor->get_token_type() !== '#tag' ) {
+			return false;
+		}
+
+		foreach ( $this->selectors as $selector ) {
+			if ( $selector->matches( $processor ) ) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/**
+	 * Array of selectors.
+	 *
+	 * Each entry is one comma-separated member of the selector list, as
+	 * produced by the `parse()` method that built this instance.
+	 *
+	 * @var array
+	 */
+	private $selectors;
+
+	/**
+	 * Constructor.
+	 *
+	 * @param array $selectors Array of selectors.
+	 */
+	protected function __construct( array $selectors ) {
+		$this->selectors = $selectors;
+	}
+
+	/**
+	 * Takes a CSS selector string and returns an instance of itself or `null` if the selector
+	 * string is invalid or unsupported.
+	 *
+	 * The selector string must be UTF-8: ill-formed byte sequences are replaced with
+	 * U+FFFD per maximal subpart before parsing and reported with `_doing_it_wrong()`.
+	 * See the "Text Encoding" section of the class documentation.
+	 *
+	 * @param string $input CSS selectors.
+	 * @return static|null
+	 */
+	public static function from_selectors( string $input ) {
+		$input = self::normalize_selector_input( $input );
+
+		// An empty selector string is not a valid selector list.
+		if ( '' === $input ) {
+			return null;
+		}
+
+		// Late static binding: subclasses that override parse() get their own grammar.
+		$offset = 0;
+		return static::parse( $input, $offset );
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		// Work on a copy so that $offset is only advanced when the whole list parses.
+		$updated_offset = $offset;
+
+		$selector = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
+		if ( null === $selector ) {
+			return null;
+		}
+		self::parse_whitespace( $input, $updated_offset );
+
+		$selectors    = array( $selector );
+		$input_length = strlen( $input );
+		while ( $updated_offset < $input_length ) {
+			// Each loop should stop on a `,` selector list delimiter.
+			if ( ',' !== $input[ $updated_offset ] ) {
+				return null;
+			}
+			++$updated_offset;
+			self::parse_whitespace( $input, $updated_offset );
+
+			// A delimiter must be followed by another selector; a trailing `,` is invalid.
+			$selector = WP_CSS_Compound_Selector::parse( $input, $updated_offset );
+			if ( null === $selector ) {
+				return null;
+			}
+			$selectors[] = $selector;
+			self::parse_whitespace( $input, $updated_offset );
+		}
+
+		$offset = $updated_offset;
+		return new self( $selectors );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-compound-selector.php b/src/wp-includes/html-api/css/class-wp-css-compound-selector.php
new file mode 100644
index 0000000000000..48e206819c0d3
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-compound-selector.php
@@ -0,0 +1,130 @@
+type_selector = $type_selector;
+		$this->subclass_selectors = array() === $subclass_selectors ? null : $subclass_selectors;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * The type selector, when present, and every subclass selector must all match.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		if ( $this->type_selector && ! $this->type_selector->matches( $processor ) ) {
+			return false;
+		}
+		if ( null !== $this->subclass_selectors ) {
+			foreach ( $this->subclass_selectors as $subclass_selector ) {
+				if ( ! $subclass_selector->matches( $processor ) ) {
+					return false;
+				}
+			}
+		}
+		return true;
+	}
+
+	/**
+	 * Parses a selector string to create a selector instance.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		if ( $offset >= strlen( $input ) ) {
+			return null;
+		}
+
+		// Work on a copy; $offset is only written once the compound selector has parsed.
+		$updated_offset = $offset;
+		$type_selector  = WP_CSS_Type_Selector::parse( $input, $updated_offset );
+
+		// Collect subclass selectors until one fails to parse.
+		$subclass_selectors            = array();
+		$last_parsed_subclass_selector = self::parse_subclass_selector( $input, $updated_offset );
+		while ( null !== $last_parsed_subclass_selector ) {
+			$subclass_selectors[]          = $last_parsed_subclass_selector;
+			$last_parsed_subclass_selector = self::parse_subclass_selector( $input, $updated_offset );
+		}
+
+		// There must be at least one selector.
+		if ( null === $type_selector && array() === $subclass_selectors ) {
+			return null;
+		}
+
+		$offset = $updated_offset;
+		return new self( $type_selector, $subclass_selectors );
+	}
+
+	/**
+	 * Parses a subclass selector.
+	 *
+	 * > <subclass-selector> = <id-selector> | <class-selector> | <attribute-selector>
+	 *
+	 * The first byte at `$offset` selects the parser: `.` for class, `#` for ID and
+	 * `[` for attribute selectors. Any other byte yields null.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string, passed by reference to the chosen parser.
+	 * @return WP_CSS_ID_Selector|WP_CSS_Class_Selector|WP_CSS_Attribute_Selector|null
+	 */
+	private static function parse_subclass_selector( string $input, int &$offset ) {
+		if ( $offset >= strlen( $input ) ) {
+			return null;
+		}
+
+		switch ( $input[ $offset ] ) {
+			case '.':
+				return WP_CSS_Class_Selector::parse( $input, $offset );
+			case '#':
+				return WP_CSS_ID_Selector::parse( $input, $offset );
+			case '[':
+				return WP_CSS_Attribute_Selector::parse( $input, $offset );
+		}
+
+		return null;
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-id-selector.php b/src/wp-includes/html-api/css/class-wp-css-id-selector.php
new file mode 100644
index 0000000000000..e2e47a24d1e6c
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-id-selector.php
@@ -0,0 +1,72 @@
+id = $id;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		$id_attribute = $processor->get_attribute( 'id' );
+
+		// Only a string value can equal the selector's ID; anything else never matches.
+		if ( ! is_string( $id_attribute ) ) {
+			return false;
+		}
+
+		// When the processor reports quirks mode the comparison ignores letter case.
+		if ( $processor->is_quirks_mode() ) {
+			return 0 === strcasecmp( $id_attribute, $this->id );
+		}
+
+		return $id_attribute === $this->id;
+	}
+
+	/**
+	 * Parses an ID selector from the hash token found at the offset.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		$id = self::parse_hash_token( $input, $offset );
+
+		return null === $id ? null : new self( $id );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-selector-parser-matcher.php b/src/wp-includes/html-api/css/class-wp-css-selector-parser-matcher.php
new file mode 100644
index 0000000000000..14d9d28a771cc
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-selector-parser-matcher.php
@@ -0,0 +1,573 @@
+ 0;
+		$offset += $length;
+		return $advanced;
+	}
+
+	/**
+	 * Tokenization of hash tokens
+	 *
+	 * > U+0023 NUMBER SIGN (#)
+	 * >   If the next input code point is an ident code point or the next two input code points are a valid escape, then:
+	 * >     1. Create a <hash-token>.
+	 * >     2. If the next 3 input code points would start an ident sequence, set the
+	 * >        <hash-token>’s type flag to "id".
+	 * >     3. Consume an ident sequence, and set the <hash-token>’s value to the
+	 * >        returned string.
+	 * >     4. Return the <hash-token>.
+	 * >   Otherwise, return a <delim-token> with its value set to the current input code point.
+	 *
+	 * This implementation is not interested in the <delim-token>, a '#' delim token is not relevant for selectors.
+	 */
+	final protected static function parse_hash_token( string $input, int &$offset ): ?string {
+		$after_hash = $offset + 1;
+
+		// A hash token needs a `#` at the offset and at least one more byte after it.
+		if ( $after_hash >= strlen( $input ) || '#' !== $input[ $offset ] ) {
+			return null;
+		}
+
+		// Parse on a copy: the caller's offset only moves when an ident follows the `#`.
+		$ident = self::parse_ident( $input, $after_hash );
+		if ( null !== $ident ) {
+			$offset = $after_hash;
+		}
+
+		return $ident;
+	}
+
+	/**
+	 * Parse a string token
+	 *
+	 * > 4.3.5. Consume a string token
+	 * > This section describes how to consume a string token from a stream of code points. It returns either a <string-token> or <bad-string-token>.
+	 * >
+	 * > This algorithm may be called with an ending code point, which denotes the code point that ends the string. If an ending code point is not specified, the current input code point is used.
+	 * >
+	 * > Initially create a <string-token> with its value set to the empty string.
+	 * >
+	 * > Repeatedly consume the next input code point from the stream:
+	 * >
+	 * > ending code point
+	 * >   Return the <string-token>.
+	 * > EOF
+	 * >   This is a parse error. Return the <string-token>.
+	 * > newline
+	 * >   This is a parse error. Reconsume the current input code point, create a <bad-string-token>, and return it.
+	 * > U+005C REVERSE SOLIDUS (\)
+	 * >   If the next input code point is EOF, do nothing.
+	 * >   Otherwise, if the next input code point is a newline, consume it.
+	 * >   Otherwise, (the stream starts with a valid escape) consume an escaped code point and append the returned code point to the <string-token>’s value.
+	 * >
+	 * > anything else
+	 * >   Append the current input code point to the <string-token>’s value.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#consume-string-token
+	 *
+	 * This implementation will never return a <bad-string-token> because
+	 * the <bad-string-token> is not a part of the selector grammar. That
+	 * case is treated as failure to parse and null is returned.
+	 *
+	 * @return string|null The parsed string token value, or null if parsing failed.
+	 */
+	final protected static function parse_string( string $input, int &$offset ): ?string {
+		$length = strlen( $input );
+		if ( $offset >= $length ) {
+			return null;
+		}
+
+		// The opening quote doubles as the ending code point.
+		$quote = $input[ $offset ];
+		if ( '"' !== $quote && "'" !== $quote ) {
+			return null;
+		}
+
+		$value      = '';
+		$cursor     = $offset + 1;
+		$stop_bytes = "\\\n{$quote}";
+
+		while ( $cursor < $length ) {
+			// Copy the whole run of bytes that need no special handling in one step.
+			$plain_length = strcspn( $input, $stop_bytes, $cursor );
+			if ( $plain_length > 0 ) {
+				$value  .= substr( $input, $cursor, $plain_length );
+				$cursor += $plain_length;
+
+				// Reaching the end of the input ends the string.
+				if ( $cursor >= $length ) {
+					break;
+				}
+			}
+
+			$byte = $input[ $cursor ];
+
+			if ( $quote === $byte ) {
+				// Step over the closing quote: the string token is complete.
+				++$cursor;
+				break;
+			}
+
+			/*
+			 * This case would return a <bad-string-token>.
+			 * The <bad-string-token> is not a part of the selector grammar
+			 * so we do not return it and instead treat this as a
+			 * failure to parse a string token.
+			 */
+			if ( "\n" === $byte ) {
+				return null;
+			}
+
+			if ( '\\' === $byte ) {
+				++$cursor;
+
+				// A backslash that is the last byte of the input adds nothing to the value.
+				if ( $cursor >= $length ) {
+					break;
+				}
+
+				if ( "\n" === $input[ $cursor ] ) {
+					// An escaped newline is consumed without adding to the value.
+					++$cursor;
+				} else {
+					$value .= self::consume_escaped_codepoint( $input, $cursor );
+				}
+			}
+		}
+
+		$offset = $cursor;
+		return $value;
+	}
+
+	/**
+	 * Consume an escaped code point.
+	 *
+	 * > 4.3.7. Consume an escaped code point
+	 * > This section describes how to consume an escaped code point. It assumes that the U+005C
+	 * > REVERSE SOLIDUS (\) has already been consumed and that the next input code point has
+	 * > already been verified to be part of a valid escape. It will return a code point.
+	 * >
+	 * > Consume the next input code point.
+	 * >
+	 * > hex digit
+	 * >   Consume as many hex digits as possible, but no more than 5. Note that this means 1-6
+	 * >   hex digits have been consumed in total.
+	 * >   If the next input code point is whitespace,
+	 * >   consume it as well. Interpret the hex digits as a hexadecimal number. If this number is
+	 * >   zero, or is for a surrogate, or is greater than the maximum allowed code point, return
+	 * >   U+FFFD REPLACEMENT CHARACTER (�). Otherwise, return the code point with that value.
+	 * > EOF
+	 * >   This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
+	 * > anything else
+	 * >   Return the current input code point.
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset just past the backslash. Passed by reference and
+	 *                       advanced past whatever the escape consumed.
+	 * @return string The decoded code point, or U+FFFD where the rules above call for it.
+	 */
+	final protected static function consume_escaped_codepoint( string $input, int &$offset ): string {
+		/*
+		 * > EOF
+		 * >   This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
+		 */
+		if ( $offset >= strlen( $input ) ) {
+			return "\u{FFFD}";
+		}
+
+		$hex_length = strspn( $input, '0123456789abcdefABCDEF', $offset, 6 );
+		if ( $hex_length > 0 ) {
+			/**
+			 * At most six hex digits are read, so the value never exceeds
+			 * 0xFFFFFF: it always fits in an int and is never a float.
+			 *
+			 * @var int
+			 */
+			$codepoint_value = hexdec( substr( $input, $offset, $hex_length ) );
+
+			/*
+			 * > A surrogate is a leading surrogate or a trailing surrogate.
+			 * > A leading surrogate is a code point that is in the range U+D800 to U+DBFF, inclusive.
+			 * > A trailing surrogate is a code point that is in the range U+DC00 to U+DFFF, inclusive.
+			 *
+			 * The surrogate ranges are adjacent, so the complete range is 0xD800 to 0xDFFF, inclusive.
+			 */
+			$codepoint_char = (
+				0 === $codepoint_value ||
+				$codepoint_value > self::UTF8_MAX_CODEPOINT_VALUE ||
+				( 0xD800 <= $codepoint_value && $codepoint_value <= 0xDFFF )
+			)
+				? "\u{FFFD}"
+				: mb_chr( $codepoint_value, 'UTF-8' );
+
+			$offset += $hex_length;
+
+			// If the next input code point is whitespace, consume it as well.
+			if (
+				strlen( $input ) > $offset &&
+				(
+					"\n" === $input[ $offset ] ||
+					"\t" === $input[ $offset ] ||
+					' ' === $input[ $offset ]
+				)
+			) {
+				++$offset;
+			}
+			return $codepoint_char;
+		}
+
+		/*
+		 * Find the byte length of the code point at $offset without copying the rest
+		 * of the input: a code point is at most 4 bytes, so the scan is bounded and
+		 * an escape of valid UTF-8 decodes in O(1) regardless of selector length.
+		 *
+		 * `_wp_utf8_codepoint_span()` is not suitable here: it does not bound the
+		 * scan, so its ASCII fast-path reads to the end of the input on every call,
+		 * which is quadratic over a selector composed of escapes.
+		 */
+		$at             = $offset;
+		$invalid_length = 0;
+		_wp_scan_utf8( $input, $at, $invalid_length, 4, 1 );
+		if ( $at > $offset ) {
+			$codepoint_char = substr( $input, $offset, $at - $offset );
+			$offset         = $at;
+			return $codepoint_char;
+		}
+
+		/*
+		 * The bytes at $offset are not valid UTF-8, which can only happen when
+		 * `parse()` was called directly with un-normalized input: the public
+		 * `from_selectors()` API replaces ill-formed byte sequences with U+FFFD
+		 * before parsing. Decode consistently with that normalization — consume
+		 * the maximal subpart of the ill-formed sequence, whose length the scan
+		 * above reported, and return a single U+FFFD.
+		 */
+		$offset += max( 1, $invalid_length );
+		return "\u{FFFD}";
+	}
+
+	/**
+	 * Parse an ident token
+	 *
+	 * CAUTION: This method is _not_ for parsing an ID selector!
+	 *
+	 * > 4.3.11. Consume an ident sequence
+	 * > This section describes how to consume an ident sequence from a stream of code points. It returns a string containing the largest name that can be formed from adjacent code points in the stream, starting from the first.
+	 * >
+	 * > Note: This algorithm does not do the verification of the first few code points that are necessary to ensure the returned code points would constitute an <ident-token>.
+	 * > If that is the intended use, ensure that the stream starts with an ident sequence before calling this algorithm.
+	 * >
+	 * > Let result initially be an empty string.
+	 * >
+	 * > Repeatedly consume the next input code point from the stream:
+	 * >
+	 * > ident code point
+	 * >   Append the code point to result.
+	 * > the stream starts with a valid escape
+	 * >   Consume an escaped code point. Append the returned code point to result.
+	 * > anything else
+	 * >   Reconsume the current input code point. Return result.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#consume-name
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string. Passed by reference and advanced
+	 *                       past the bytes consumed for the identifier.
+	 * @return string|null The parsed identifier name, or null if parsing failed.
+	 */
+	final protected static function parse_ident( string $input, int &$offset ): ?string {
+		// Unlike the quoted algorithm, this method verifies the start of the sequence itself.
+		if ( ! self::check_if_three_code_points_would_start_an_ident_sequence( $input, $offset ) ) {
+			return null;
+		}
+
+		$name   = '';
+		$length = strlen( $input );
+
+		while ( $offset < $length ) {
+			if ( self::next_two_are_valid_escape( $input, $offset ) ) {
+				// Move past the `\` character.
+				++$offset;
+				$name .= self::consume_escaped_codepoint( $input, $offset );
+			} elseif ( self::is_ident_codepoint( $input, $offset ) ) {
+				// Ident code points are appended one byte at a time.
+				$name .= $input[ $offset ];
+				++$offset;
+			} else {
+				// Anything else ends the identifier and is left unconsumed.
+				break;
+			}
+		}
+
+		return $name;
+	}
+
+	/*
+	 * --------------------------
+	 * Selector parsing utilities
+	 * --------------------------
+	 *
+	 * The following functions are used for parsing but do not consume any input.
+	 */
+
+	/**
+	 * Checks for two valid escape codepoints.
+	 *
+	 * > 4.3.8. Check if two code points are a valid escape
+	 * > This section describes how to check if two code points are a valid escape. The algorithm
+	 * > described here can be called explicitly with two code points, or can be called with the
+	 * > input stream itself. In the latter case, the two code points in question are the current
+	 * > input code point and the next input code point, in that order.
+	 * >
+	 * > Note: This algorithm will not consume any additional code point.
+	 * >
+	 * > If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
+	 * >
+	 * > Otherwise, if the second code point is a newline, return false.
+	 * >
+	 * > Otherwise, return true.
+	 *
+	 * https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
+	 *
+	 * @todo The second codepoint is not checked for validity.
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next two codepoints are a valid escape, otherwise false.
+	 */
+	final protected static function next_two_are_valid_escape( string $input, int $offset ): bool {
+		$length = strlen( $input );
+
+		// Without a backslash at the offset there is no escape at all.
+		if ( $offset >= $length || '\\' !== $input[ $offset ] ) {
+			return false;
+		}
+
+		/*
+		 * The second code point may be EOF. EOF is not a newline, so a
+		 * backslash at the end of input is a valid escape; consuming it
+		 * produces U+FFFD REPLACEMENT CHARACTER.
+		 */
+		$second = $offset + 1;
+
+		return $second >= $length || "\n" !== $input[ $second ];
+	}
+
+	/**
+	 * Checks if the next code point is an "ident start code point."
+	 *
+	 * Caution! This method does not do any bounds checking, it should not be passed
+	 * a string with an offset that is out of bounds.
+	 *
+	 * > ident-start code point
+	 * >   A letter, a non-ASCII code point, or U+005F LOW LINE (_).
+	 * > uppercase letter
+	 * >   A code point between U+0041 LATIN CAPITAL LETTER A (A) and U+005A LATIN CAPITAL LETTER Z (Z) inclusive.
+	 * > lowercase letter
+	 * >   A code point between U+0061 LATIN SMALL LETTER A (a) and U+007A LATIN SMALL LETTER Z (z) inclusive.
+	 * > letter
+	 * >   An uppercase letter or a lowercase letter.
+	 * > non-ASCII code point
+	 * >   A code point with a value equal to or greater than U+0080 <control>.
+	 *
+	 * @link https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next codepoint is an ident start code point, otherwise false.
+	 */
+	final protected static function is_ident_start_codepoint( string $input, int $offset ): bool {
+		$byte = $input[ $offset ];
+
+		// U+005F LOW LINE (_).
+		if ( '_' === $byte ) {
+			return true;
+		}
+
+		// Lowercase letter.
+		if ( 'a' <= $byte && $byte <= 'z' ) {
+			return true;
+		}
+
+		// Uppercase letter.
+		if ( 'A' <= $byte && $byte <= 'Z' ) {
+			return true;
+		}
+
+		// Any byte above 0x7F is treated as part of a non-ASCII code point.
+		return ord( $byte ) > 0x7F;
+	}
+
+	/**
+	 * Checks if the next code point is an "ident code point."
+	 *
+	 * Caution! This method does not do any bounds checking, it should not be passed
+	 * a string with an offset that is out of bounds.
+	 *
+	 * > ident code point
+	 * >   An ident-start code point, a digit, or U+002D HYPHEN-MINUS (-).
+	 * > digit
+	 * >   A code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9) inclusive.
+	 *
+	 * @link https://www.w3.org/TR/css-syntax-3/#ident-code-point
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next codepoint is an ident code point, otherwise false.
+	 */
+	final protected static function is_ident_codepoint( string $input, int $offset ): bool {
+		$byte = $input[ $offset ];
+
+		// U+002D HYPHEN-MINUS (-).
+		if ( '-' === $byte ) {
+			return true;
+		}
+
+		// Digit.
+		if ( '0' <= $byte && $byte <= '9' ) {
+			return true;
+		}
+
+		return self::is_ident_start_codepoint( $input, $offset );
+	}
+
+	/**
+	 * Checks if three code points would start an ident sequence.
+	 *
+	 * > 4.3.9. Check if three code points would start an ident sequence
+	 * > This section describes how to check if three code points would start an ident sequence. The algorithm described here can be called explicitly with three code points, or can be called with the input stream itself. In the latter case, the three code points in question are the current input code point and the next two input code points, in that order.
+	 * >
+	 * > Note: This algorithm will not consume any additional code points.
+	 * >
+	 * > Look at the first code point:
+	 * >
+	 * > U+002D HYPHEN-MINUS
+	 * >   If the second code point is an ident-start code point or a U+002D HYPHEN-MINUS, or the second and third code points are a valid escape, return true. Otherwise, return false.
+	 * > ident-start code point
+	 * >   Return true.
+	 * > U+005C REVERSE SOLIDUS (\)
+	 * >   If the first and second code points are a valid escape, return true. Otherwise, return false.
+	 * > anything else
+	 * >   Return false.
+	 *
+	 * @link https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
+	 *
+	 * @param string $input  The input string.
+	 * @param int    $offset The byte offset in the string.
+	 * @return bool True if the next three codepoints would start an ident sequence, otherwise false.
+	 */
+	final protected static function check_if_three_code_points_would_start_an_ident_sequence( string $input, int $offset ): bool {
+		$length = strlen( $input );
+		if ( $offset >= $length ) {
+			return false;
+		}
+
+		$first = $input[ $offset ];
+
+		// > U+005C REVERSE SOLIDUS (\)
+		if ( '\\' === $first ) {
+			return self::next_two_are_valid_escape( $input, $offset );
+		}
+
+		// > ident-start code point
+		// >   Return true.
+		// > anything else
+		// >   Return false.
+		if ( '-' !== $first ) {
+			return self::is_ident_start_codepoint( $input, $offset );
+		}
+
+		// > U+002D HYPHEN-MINUS
+		$second_offset = $offset + 1;
+		if ( $second_offset >= $length ) {
+			return false;
+		}
+
+		/*
+		 * > If the second code point is… U+002D HYPHEN-MINUS… return true
+		 * > If the second and third code points are a valid escape… return true.
+		 * > If the second code point is an ident-start code point… return true.
+		 * > Otherwise, return false.
+		 */
+		return '-' === $input[ $second_offset ]
+			|| self::next_two_are_valid_escape( $input, $second_offset )
+			|| self::is_ident_start_codepoint( $input, $second_offset );
+	}
+
+	/**
+	 * Normalizes selector input for processing: decodes the byte stream as
+	 * UTF-8 ( replacing ill-formed sequences with U+FFFD ), then filters the
+	 * code points per the input-preprocessing rules.
+	 *
+	 * @see https://www.w3.org/TR/css-syntax-3/#input-byte-stream
+	 * @see https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+	 *
+	 * @param string $input The selector string.
+	 * @return string The normalized selector string.
+	 */
+	final protected static function normalize_selector_input( string $input ): string {
+		/*
+		 * > The input byte stream defines the byte stream that comprises a style sheet.
+		 * > To decode bytes into a stream of code points…
+		 *
+		 * Selector strings are UTF-8 text. Decoding replaces each maximal
+		 * subpart of an ill-formed byte sequence with U+FFFD REPLACEMENT
+		 * CHARACTER (�), per the WHATWG Encoding Standard's UTF-8 decoder.
+		 * The replaced selector is unlikely to match the elements the
+		 * developer intended, so the replacement also reports a notice.
+		 *
+		 * https://www.w3.org/TR/css-syntax-3/#input-byte-stream
+		 */
+		$scrubbed = wp_scrub_utf8( $input );
+		if ( $scrubbed !== $input ) {
+			_doing_it_wrong(
+				get_called_class() . '::from_selectors',
+				'Selector strings must be valid UTF-8: ill-formed byte sequences were replaced with U+FFFD (�), which is unlikely to match the intended elements.',
+				'{WP_VERSION}'
+			);
+			$input = $scrubbed;
+		}
+
+		/*
+		 * > A selector string is a list of one or more complex selectors ([SELECTORS4], section 3.1) that may be surrounded by whitespace…
+		 *
+		 * This list includes \f.
+		 *
+		 * Only leading whitespace is removed here. Trailing whitespace may be
+		 * significant: a backslash may escape a final whitespace code point
+		 * into an ident (`.foo\ ` is the class `foo `), and a backslash
+		 * before a final newline is an invalid escape, while a backslash at
+		 * the end of input is a valid escape that decodes to U+FFFD.
+		 * The selector grammar consumes insignificant trailing whitespace itself.
+		 */
+		$input = ltrim( $input, " \t\r\n\f" );
+
+		/*
+		 * > The input stream consists of the filtered code points pushed into it as the input byte stream is decoded.
+		 * >
+		 * > To filter code points from a stream of (unfiltered) code points input:
+		 * >   Replace any U+000D CARRIAGE RETURN (CR) code points, U+000C FORM FEED (FF) code points, or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF) in input by a single U+000A LINE FEED (LF) code point.
+		 * >   Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
+		 *
+		 * https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+		 *
+		 * The search strings are applied in the order listed, so a CR LF pair
+		 * collapses to one newline before the lone CR and FF bytes are replaced.
+		 */
+		$input = str_replace( array( "\r\n", "\r", "\f" ), "\n", $input );
+
+		return str_replace( "\0", "\u{FFFD}", $input );
+	}
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-type-selector.php b/src/wp-includes/html-api/css/class-wp-css-type-selector.php
new file mode 100644
index 0000000000000..c7c7baa2d5508
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-type-selector.php
@@ -0,0 +1,90 @@
+type = $type;
+	}
+
+	/**
+	 * Determines if the processor's current position matches the selector.
+	 *
+	 * A position without a tag name never matches; otherwise the decision is
+	 * delegated to the tag-name comparison.
+	 *
+	 * @param WP_HTML_Tag_Processor $processor The processor.
+	 * @return bool True if the processor's current position matches the selector.
+	 */
+	public function matches( WP_HTML_Tag_Processor $processor ): bool {
+		$tag_name = $processor->get_tag();
+
+		return null !== $tag_name && $this->matches_tag( $tag_name );
+	}
+
+	/**
+	 * Checks whether the selector matches the provided tag name.
+	 *
+	 * The universal selector `*` matches every tag name; any other type is
+	 * compared with the tag name without regard to letter case.
+	 *
+	 * @param string $tag_name The tag name to test.
+	 * @return bool True if the selector matches the tag name.
+	 */
+	public function matches_tag( string $tag_name ): bool {
+		return '*' === $this->type || 0 === strcasecmp( $tag_name, $this->type );
+	}
+
+	/**
+	 * Parses a type selector: either the universal selector `*` or an identifier.
+	 *
+	 * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+	 *
+	 * @param string $input  The selector string.
+	 * @param int    $offset The offset into the string. The offset is passed by reference and
+	 *                       will be updated if the parse is successful.
+	 * @return static|null The selector instance, or null if the parse was unsuccessful.
+	 */
+	public static function parse( string $input, int &$offset ) {
+		if ( $offset >= strlen( $input ) ) {
+			return null;
+		}
+
+		// The universal selector is a single `*` byte.
+		if ( '*' === $input[ $offset ] ) {
+			++$offset;
+			return new WP_CSS_Type_Selector( '*' );
+		}
+
+		$type = self::parse_ident( $input, $offset );
+
+		return null === $type ? null : new self( $type );
+	}
+}
diff --git a/src/wp-settings.php b/src/wp-settings.php
index ef5c7784ee561..e9ff3af23a096 100644
--- a/src/wp-settings.php
+++ b/src/wp-settings.php
@@ -278,6 +278,15 @@
 require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php';
 require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php';
 require ABSPATH . WPINC . '/class-wp-block-processor.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-selector-parser-matcher.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-attribute-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-class-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-id-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-type-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-compound-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-complex-selector.php';
+require ABSPATH . WPINC .
'/html-api/css/class-wp-css-compound-selector-list.php'; +require ABSPATH . WPINC . '/html-api/css/class-wp-css-complex-selector-list.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . '/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/html-api/wpCssAttributeSelector.php b/tests/phpunit/tests/html-api/wpCssAttributeSelector.php new file mode 100644 index 0000000000000..99051f2cc971c --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssAttributeSelector.php @@ -0,0 +1,123 @@ +assertNull( $result ); + } else { + $this->assertNotNull( $result, "Failed to parse attribute selector: {$input}" ); + $this->assertSame( $expected_name, $result->name ); + $this->assertSame( $expected_matcher, $result->matcher ); + $this->assertSame( $expected_value, $result->value ); + $this->assertSame( $expected_modifier, $result->modifier ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. 
+ * + * @return array + */ + public static function data_attribute_selectors(): array { + return array( + '[href]' => array( '[href]', 'href', null, null, null, '' ), + '[href] type' => array( '[href] type', 'href', null, null, null, ' type' ), + '[href]#id' => array( '[href]#id', 'href', null, null, null, '#id' ), + '[href].class' => array( '[href].class', 'href', null, null, null, '.class' ), + '[href][href2]' => array( '[href][href2]', 'href', null, null, null, '[href2]' ), + '[\n href\t\r]' => array( "[\n href\t\r]", 'href', null, null, null, '' ), + '[href=foo]' => array( '[href=foo]', 'href', WP_CSS_Attribute_Selector::MATCH_EXACT, 'foo', null, '' ), + '[a=b]' => array( '[a=b]', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + '[href \n = bar ]' => array( "[href \n = bar ]", 'href', WP_CSS_Attribute_Selector::MATCH_EXACT, 'bar', null, '' ), + '[href \n ^= baz ]' => array( "[href \n ^= baz ]", 'href', WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, 'baz', null, '' ), + + '[match $= insensitive i]' => array( '[match $= insensitive i]', 'match', WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, 'insensitive', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + '[match|=sensitive s]' => array( '[match|=sensitive s]', 'match', WP_CSS_Attribute_Selector::MATCH_EXACT_OR_HYPHEN_SUFFIXED, 'sensitive', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + '[att=val I]' => array( '[att=val I]', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + '[att=val S]' => array( '[att=val S]', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + + '[match~="quoted[][]"]' => array( '[match~="quoted[][]"]', 'match', WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT, 'quoted[][]', null, '' ), + "[match$='quoted!{}']" => array( "[match$='quoted!{}']", 'match', WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, 'quoted!{}', null, '' ), + 
"[match*='quoted's]" => array( "[match*='quoted's]", 'match', WP_CSS_Attribute_Selector::MATCH_CONTAINS, 'quoted', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + + '[escape-nl="foo\\nbar"]' => array( "[escape-nl='foo\\\nbar']", 'escape-nl', WP_CSS_Attribute_Selector::MATCH_EXACT, 'foobar', null, '' ), + '[escape-seq="\\31 23"]' => array( "[escape-seq='\\31 23']", 'escape-seq', WP_CSS_Attribute_Selector::MATCH_EXACT, '123', null, '' ), + + /* + * The end of input closes an open attribute selector: tokenization + * auto-closes unterminated simple blocks (and strings) at EOF. + * + * https://www.w3.org/TR/css-syntax-3/#consume-simple-block + */ + 'EOF [foo' => array( '[foo', 'foo', null, null, null, '' ), + 'EOF [ \n foo' => array( "[ \n foo", 'foo', null, null, null, '' ), + 'EOF [foo ' => array( '[foo ', 'foo', null, null, null, '' ), + 'EOF [a=b' => array( '[a=b', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + 'EOF [att=val ' => array( '[att=val ', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', null, '' ), + 'EOF [a="b' => array( '[a="b', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + "EOF [a='b" => array( "[a='b", 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + 'EOF [a="b\\' => array( '[a="b\\', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ), + 'EOF [a=b\\' => array( '[a=b\\', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, "b\u{FFFD}", null, '' ), + 'EOF [a^=b' => array( '[a^=b', 'a', WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, 'b', null, '' ), + 'EOF [att=val i' => array( '[att=val i', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + 'EOF [att=val i ' => array( '[att=val i ', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ), + 'EOF [att="val"s' => array( '[att="val"s', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', 
WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: foo' => array( 'foo' ), + 'Invalid: [' => array( '[' ), + 'Invalid: [ ' => array( '[ ' ), + 'Invalid: [a=' => array( '[a=' ), + 'Invalid: [a= ' => array( '[a= ' ), + 'Invalid: [a~' => array( '[a~' ), + 'Invalid: [a=b x' => array( '[a=b x' ), + 'Invalid: [a i' => array( '[a i' ), + 'Invalid: [#foo]' => array( '[#foo]' ), + 'Invalid: [*|*]' => array( '[*|*]' ), + 'Invalid: [ns|*]' => array( '[ns|*]' ), + 'Invalid: [* |att]' => array( '[* |att]' ), + 'Invalid: [*| att]' => array( '[*| att]' ), + 'Invalid: [att * =]' => array( '[att * =]' ), + 'Invalid: [att+=val]' => array( '[att+=val]' ), + 'Invalid: [a=]' => array( '[a=]' ), + 'Invalid: [a~=]' => array( '[a~=]' ), + 'Invalid: [a==b]' => array( '[a==b]' ), + 'Invalid: [a=1]' => array( '[a=1]' ), + 'Invalid: [a=1' => array( '[a=1' ), + 'Invalid: [att i]' => array( '[att i]' ), + 'Invalid: [att s]' => array( '[att s]' ), + "Invalid: [att='val\\n']" => array( "[att='val\n']" ), + "Invalid: [att='val\\n" => array( "[att='val\n" ), + 'Invalid: [att="val"ix' => array( '[att="val"ix' ), + 'Invalid: [att="val"ix ' => array( '[att="val"ix ' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssClassSelector.php b/tests/phpunit/tests/html-api/wpCssClassSelector.php new file mode 100644 index 0000000000000..3328b047fa143 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssClassSelector.php @@ -0,0 +1,50 @@ +assertNull( $result ); + } else { + $this->assertSame( $expected, $result->class_name ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. 
+ * + * @return array + */ + public static function data_class_selectors(): array { + return array( + 'valid ._-foo123' => array( '._-foo123', '_-foo123', '' ), + 'valid .foo.bar' => array( '.foo.bar', 'foo', '.bar' ), + 'escaped .\31 23' => array( '.\\31 23', '123', '' ), + 'with descendant .\31 23 div' => array( '.\\31 23 div', '123', ' div' ), + 'escape at EOF .foo\\' => array( '.foo\\', "foo\u{fffd}", '' ), + + 'not class foo' => array( 'foo' ), + 'not class #bar' => array( '#bar' ), + 'not valid .1foo' => array( '.1foo' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssComplexSelector.php b/tests/phpunit/tests/html-api/wpCssComplexSelector.php new file mode 100644 index 0000000000000..8738bb6fc32d2 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssComplexSelector.php @@ -0,0 +1,71 @@ + .child#bar[baz=quux] , rest'; + $offset = 0; + + /** @var WP_CSS_Complex_Selector|null */ + $sel = WP_CSS_Complex_Selector::parse( $input, $offset ); + + $this->assertSame( 2, count( $sel->context_selectors ) ); + + // Relative selectors should be reverse ordered. 
+ $this->assertSame( 'el2', $sel->context_selectors[0][0]->type ); + $this->assertSame( WP_CSS_Complex_Selector::COMBINATOR_CHILD, $sel->context_selectors[0][1] ); + + $this->assertSame( 'el1', $sel->context_selectors[1][0]->type ); + $this->assertSame( WP_CSS_Complex_Selector::COMBINATOR_DESCENDANT, $sel->context_selectors[1][1] ); + + $this->assertSame( 3, count( $sel->self_selector->subclass_selectors ) ); + $this->assertNull( $sel->self_selector->type_selector ); + $this->assertSame( 'child', $sel->self_selector->subclass_selectors[0]->class_name ); + + $this->assertSame( ', rest', substr( $input, $offset ) ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_complex_selector() { + $input = 'el.foo#bar[baz=quux] > , rest'; + $offset = 0; + $result = WP_CSS_Complex_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_complex_selector_nonfinal_subclass() { + $input = 'el.foo#bar[baz=quux] > final, rest'; + $offset = 0; + $result = WP_CSS_Complex_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_complex_selector() { + $input = ''; + $offset = 0; + $result = WP_CSS_Complex_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php b/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php new file mode 100644 index 0000000000000..b85f788f98f0d --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php @@ -0,0 +1,65 @@ + selector'; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNotNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_selector_list() { + $input = 'el,,'; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 
+ */ + public function test_parse_invalid_selector_list2() { + $input = 'el!'; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_selector_list() { + $input = " \t \t\n\r\f"; + $result = WP_CSS_Complex_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * The invalid-UTF-8 scrub notice reports the called class: through this + * class it must be named WP_CSS_Complex_Selector_List::from_selectors, + * not the WP_CSS_Compound_Selector_List parent where from_selectors() + * and the scrub are implemented. The fuzzer's notice model depends on + * the per-class name. + * + * @expectedIncorrectUsage WP_CSS_Complex_Selector_List::from_selectors + */ + public function test_invalid_utf8_scrub_notice_reports_the_called_class() { + $result = WP_CSS_Complex_Selector_List::from_selectors( "el \xC2.child" ); + $this->assertNotNull( $result, 'Selector with invalid UTF-8 should parse after scrubbing.' 
); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssCompoundSelector.php b/tests/phpunit/tests/html-api/wpCssCompoundSelector.php new file mode 100644 index 0000000000000..8092ee049b6e1 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssCompoundSelector.php @@ -0,0 +1,44 @@ + .child'; + $offset = 0; + $sel = WP_CSS_Compound_Selector::parse( $input, $offset ); + + $this->assertSame( 'el', $sel->type_selector->type ); + $this->assertSame( 3, count( $sel->subclass_selectors ) ); + $this->assertSame( 'foo', $sel->subclass_selectors[0]->class_name, 'foo' ); + $this->assertSame( 'bar', $sel->subclass_selectors[1]->id, 'bar' ); + $this->assertSame( 'baz', $sel->subclass_selectors[2]->name, 'baz' ); + $this->assertSame( WP_CSS_Attribute_Selector::MATCH_EXACT, $sel->subclass_selectors[2]->matcher ); + $this->assertSame( 'quux', $sel->subclass_selectors[2]->value ); + $this->assertSame( ' > .child', substr( $input, $offset ) ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_selector() { + $input = ''; + $offset = 0; + $result = WP_CSS_Compound_Selector::parse( $input, $offset ); + $this->assertNull( $result ); + $this->assertSame( 0, $offset ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php b/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php new file mode 100644 index 0000000000000..33149c22ed400 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php @@ -0,0 +1,143 @@ +assertNotNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_selector_list() { + $input = 'el,,'; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_invalid_selector_list2() { + $input = 'el!'; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * An escaped whitespace code point at the end of input 
belongs to the + * ident and must survive input normalization: `.foo\ ` is the valid + * class `foo ` (with a space), not a backslash at the end of input. + * + * @ticket 62653 + */ + public function test_parse_escaped_whitespace_at_end_of_input() { + $result = WP_CSS_Compound_Selector_List::from_selectors( '.foo\\ ' ); + $this->assertNotNull( $result ); + } + + /** + * A backslash before a newline is not a valid escape; at the end of + * input it must not be mistaken for trimmable trailing whitespace. + * + * @ticket 62653 + */ + public function test_parse_escape_before_newline_at_end_of_input_is_invalid() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".foo\\\n" ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_parse_empty_selector_list() { + $input = " \t \t\n\r\f"; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * @ticket 62653 + */ + public function test_unsupported_complex_selector() { + $input = 'ancestor descendant'; + $result = WP_CSS_Compound_Selector_List::from_selectors( $input ); + $this->assertNull( $result ); + } + + /** + * Selector strings are UTF-8 text: invalid byte sequences are replaced + * with U+FFFD per maximal subpart (CSS Syntax §3.2 via the WHATWG + * Encoding Standard) before parsing, so the selector parses rather than + * being rejected. The replacement is almost certainly not what the + * developer meant, so it also triggers `_doing_it_wrong()`. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_invalid_utf8_is_scrubbed_to_replacement_character_and_notifies() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".B\xFCcher" ); + $this->assertNotNull( $result, 'Selector with invalid UTF-8 should parse after scrubbing.' 
); + } + + /** + * Valid UTF-8 — including a literal U+FFFD — must parse without any + * incorrect-usage notice: scrubbing is the identity function on valid + * input. + */ + public function test_valid_utf8_with_literal_replacement_character_is_not_notified() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".B\u{FFFD}cher" ); + $this->assertNotNull( $result, 'Selector containing a literal U+FFFD should parse.' ); + } + + /** + * The whole input is scrubbed uniformly, so a selector list with invalid + * bytes in one of several selectors still parses as a list. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_invalid_utf8_in_selector_list_is_scrubbed() { + $result = WP_CSS_Compound_Selector_List::from_selectors( ".ok, .B\xE2\x8Ccher" ); + $this->assertNotNull( $result, 'Selector list with invalid UTF-8 should parse after scrubbing.' ); + } + + /** + * A selector consisting of nothing but an invalid byte parses: it scrubs + * to U+FFFD, which is an ident-start code point and therefore a valid + * type selector. Surprising, but it follows from the scrub running + * before tokenization — the parser never sees the invalid byte. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_lone_invalid_byte_parses_as_replacement_character_type_selector() { + $result = WP_CSS_Compound_Selector_List::from_selectors( "\x80" ); + $this->assertNotNull( $result, 'A lone invalid byte should parse as a U+FFFD type selector.' ); + } + + /** + * The scrub notice reports the byte replacement, which happens before + * parsing — it fires even when the scrubbed selector is then rejected + * by the grammar. 
+ * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_invalid_utf8_notice_fires_even_when_selector_is_rejected() { + $result = WP_CSS_Compound_Selector_List::from_selectors( "\x80 div" ); + $this->assertNull( $result, 'Descendant combinators are unsupported by the compound list; the scrubbed selector should still be rejected.' ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssIdSelector.php b/tests/phpunit/tests/html-api/wpCssIdSelector.php new file mode 100644 index 0000000000000..03694fa4456e5 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssIdSelector.php @@ -0,0 +1,51 @@ +assertNull( $result ); + } else { + $this->assertSame( $expected, $result->id ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. + * + * @return array + */ + public static function data_id_selectors(): array { + return array( + 'valid #_-foo123' => array( '#_-foo123', '_-foo123', '' ), + 'valid #foo#bar' => array( '#foo#bar', 'foo', '#bar' ), + 'escaped #\31 23' => array( '#\\31 23', '123', '' ), + 'with descendant #\31 23 div' => array( '#\\31 23 div', '123', ' div' ), + 'escape at EOF #foo\\' => array( '#foo\\', "foo\u{fffd}", '' ), + + // Invalid + 'not ID foo' => array( 'foo' ), + 'not ID .bar' => array( '.bar' ), + 'not valid #1foo' => array( '#1foo' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php b/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php new file mode 100644 index 0000000000000..181519b3cbed3 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php @@ -0,0 +1,281 @@ +original_substitute_character = mb_substitute_character(); + mb_substitute_character( 0x2603 ); + $this->test_class = new class() extends WP_CSS_Selector_Parser_Matcher { + public function matches( $processor ): bool { + throw new Error( 'Matches called on test class.' 
); + } + public static function parse( string $input, int &$offset ) { + throw new Error( 'Parse called on test class.' ); + } + + /* + * Parsing + */ + public static function test_parse_ident( string $input, int &$offset ) { + return self::parse_ident( $input, $offset ); + } + + public static function test_parse_string( string $input, int &$offset ) { + return self::parse_string( $input, $offset ); + } + + /* + * Utilities + */ + public static function test_is_ident_codepoint( string $input, int $offset ) { + return self::is_ident_codepoint( $input, $offset ); + } + + public static function test_is_ident_start_codepoint( string $input, int $offset ) { + return self::is_ident_start_codepoint( $input, $offset ); + } + }; + } + + public function tear_down(): void { + mb_substitute_character( $this->original_substitute_character ); + parent::tear_down(); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_idents(): array { + return array( + 'trailing #' => array( '_-foo123#xyz', '_-foo123', '#xyz' ), + 'trailing .' 
=> array( '😍foo123.xyz', '😍foo123', '.xyz' ), + 'trailing " "' => array( '😍foo123 more', '😍foo123', ' more' ), + 'escaped ASCII character' => array( '\\xyz', 'xyz', '' ), + 'escape after multibyte character' => array( 'Ü\\sup', 'Üsup', '' ), + 'escape after multibyte characters' => array( 'ÜÜ\\sup', 'ÜÜsup', '' ), + 'hex escape after multibyte character' => array( 'Ü\\31 23', 'Ü123', '' ), + 'escaped space' => array( '\\ x', ' x', '' ), + 'escaped emoji' => array( '\\😍', '😍', '' ), + 'hex unicode codepoint' => array( '\\1f0a1', '🂡', '' ), + 'HEX UNICODE CODEPOINT' => array( '\\1D4B2', '𝒲', '' ), + + 'hex tab-suffixed 1' => array( "\\31\t23", '123', '' ), + 'hex newline-suffixed 1' => array( "\\31\n23", '123', '' ), + 'hex space-suffixed 1' => array( "\\31 23", '123', '' ), + 'hex tab' => array( '\\9', "\t", '' ), + 'hex a' => array( '\\61 bc', 'abc', '' ), + 'hex a max escape length' => array( '\\000061bc', 'abc', '' ), + + 'out of range replacement min' => array( '\\110000 ', "\u{fffd}", '' ), + 'out of range replacement max' => array( '\\ffffff ', "\u{fffd}", '' ), + 'leading surrogate min replacement' => array( '\\d800 ', "\u{fffd}", '' ), + 'leading surrogate max replacement' => array( '\\dbff ', "\u{fffd}", '' ), + 'trailing surrogate min replacement' => array( '\\dc00 ', "\u{fffd}", '' ), + 'trailing surrogate max replacement' => array( '\\dfff ', "\u{fffd}", '' ), + 'can start with -ident' => array( '-ident', '-ident', '' ), + 'can start with --anything' => array( '--anything', '--anything', '' ), + 'can start with ---anything' => array( '--_anything', '--_anything', '' ), + 'can start with --1anything' => array( '--1anything', '--1anything', '' ), + 'can start with -\31 23' => array( '-\31 23', '-123', '' ), + 'can start with --\31 23' => array( '--\31 23', '--123', '' ), + 'ident ends before ]' => array( 'ident]', 'ident', ']' ), + + /* + * > EOF + * > This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�). 
+ * + * https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point + */ + 'escape at EOF' => array( 'foo\\', "foo\u{fffd}", '' ), + 'lone escape at EOF' => array( '\\', "\u{fffd}", '' ), + 'hyphen then escape at EOF' => array( '-\\', "-\u{fffd}", '' ), + + // Identity escapes of multibyte characters, by UTF-8 sequence length. + 'escaped 2-byte character' => array( "\\\u{FC}z", "\u{FC}z", '' ), + 'escaped 3-byte character' => array( "\\\u{270F}z", "\u{270F}z", '' ), + 'escaped 4-byte character' => array( "\\\u{1F0A1}z", "\u{1F0A1}z", '' ), + 'escaped 2-byte character at EOF' => array( "a\\\u{FC}", "a\u{FC}", '' ), + 'escaped 3-byte character at EOF' => array( "a\\\u{270F}", "a\u{270F}", '' ), + 'escaped 4-byte character at EOF' => array( "a\\\u{1F0A1}", "a\u{1F0A1}", '' ), + + /* + * An escaped NUL byte passes through this low-level helper unchanged. + * This is unreachable through the public selector API, where + * normalize_selector_input() replaces NUL with U+FFFD before parsing. + */ + 'escaped NUL byte' => array( "a\\\x00z", "a\x00z", '' ), + + /* + * Identity escapes of invalid UTF-8 byte sequences. + * + * These inputs are not valid UTF-8, which can only reach the parser + * through a direct `parse()` call: the public `from_selectors()` API + * replaces invalid byte sequences with U+FFFD before parsing. On + * this un-normalized path the escape decodes the maximal subpart of + * the invalid sequence (CSS Syntax §3.2 via the WHATWG Encoding + * Standard) to a single U+FFFD — independent of the + * `mb_substitute_character()` setting, which set_up() pins to ☃ + * precisely to prove that independence. Invalid bytes *after* the + * escaped subpart are not escaped; they pass through this low-level + * helper raw, exactly as unescaped invalid bytes do (the 0xAF, + * 0xA0 0x80, and 0x90 0x80 0x80 tails below). 
+ */ + 'escaped lone continuation byte' => array( "a\\\x80z", "a\u{FFFD}z", '' ), + 'escaped overlong lead 0xC0' => array( "a\\\xC0\xAFz", "a\u{FFFD}\xAFz", '' ), + 'escaped invalid lead 0xF5' => array( "a\\\xF5z", "a\u{FFFD}z", '' ), + 'escaped truncated 3-byte sequence' => array( "a\\\xE2\x80z", "a\u{FFFD}z", '' ), + 'escaped truncated 4-byte at EOF' => array( "a\\\xF0\x9F\x82", "a\u{FFFD}", '' ), + 'escaped UTF-8-encoded surrogate' => array( "a\\\xED\xA0\x80z", "a\u{FFFD}\xA0\x80z", '' ), + 'escaped sequence above U+10FFFF' => array( "a\\\xF4\x90\x80\x80z", "a\u{FFFD}\x90\x80\x80z", '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: bad start >' => array( '>ident' ), + 'Invalid: bad start [' => array( '[ident' ), + 'Invalid: bad start #' => array( '#ident' ), + 'Invalid: bad start " "' => array( ' ident' ), + 'Invalid: bad start 1' => array( '1ident' ), + 'Invalid: bad start -1' => array( '-1ident' ), + 'Invalid: bad start -' => array( '-' ), + ); + } + + /** + * @ticket 62653 + */ + public function test_is_ident_and_is_ident_start() { + $this->assertFalse( $this->test_class::test_is_ident_codepoint( '[', 0 ) ); + $this->assertFalse( $this->test_class::test_is_ident_codepoint( ']', 0 ) ); + $this->assertFalse( $this->test_class::test_is_ident_start_codepoint( '[', 0 ) ); + $this->assertFalse( $this->test_class::test_is_ident_start_codepoint( ']', 0 ) ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_idents + */ + public function test_parse_ident( string $input, ?string $expected = null, ?string $rest = null ) { + + $offset = 0; + $result = $this->test_class::test_parse_ident( $input, $offset ); + if ( null === $expected ) { + $this->assertNull( $result ); + } else { + $this->assertSame( $expected, $result, 'Ident did not match.' ); + $this->assertSame( $rest, substr( $input, $offset ), 'Offset was not updated correctly.' 
); + } + } + + /** + * The rest-of-input assertion above cannot distinguish an offset at the end + * of the input from one past it (`substr()` returns '' for both), so the + * offset arithmetic of the invalid-byte decode is pinned explicitly here: + * the escape consumes exactly the 1-byte maximal subpart and the following + * `z`, leaving the offset at — never past — the end of the input. (The + * previous `mb_substr()`-based decode advanced by the byte length of the + * substitute character and overran the end by one byte under the ☃ canary.) + */ + public function test_parse_ident_escaped_invalid_byte_does_not_overrun_offset() { + $input = "a\\\x80z"; + $offset = 0; + $result = $this->test_class::test_parse_ident( $input, $offset ); + + $this->assertSame( "a\u{FFFD}z", $result, 'Ident did not match.' ); + $this->assertSame( strlen( $input ), $offset, 'Offset should stop exactly at the end of input.' ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_strings + */ + public function test_parse_string( string $input, ?string $expected = null, ?string $rest = null ) { + $offset = 0; + $result = $this->test_class::test_parse_string( $input, $offset ); + if ( null === $expected ) { + $this->assertNull( $result ); + } else { + $this->assertSame( $expected, $result, 'String did not match.' ); + $this->assertSame( $rest, substr( $input, $offset ), 'Offset was not updated correctly.' ); + } + } + + /** + * Data provider. 
+ * + * @return array + */ + public static function data_strings(): array { + return array( + '"foo"' => array( '"foo"', 'foo', '' ), + '"foo"after' => array( '"foo"after', 'foo', 'after' ), + '"foo""two"' => array( '"foo""two"', 'foo', '"two"' ), + '"foo"\'two\'' => array( '"foo"\'two\'', 'foo', "'two'" ), + + "'foo'" => array( "'foo'", 'foo', '' ), + "'foo'after" => array( "'foo'after", 'foo', 'after' ), + "'foo'\"two\"" => array( "'foo'\"two\"", 'foo', '"two"' ), + "'foo''two'" => array( "'foo''two'", 'foo', "'two'" ), + + "'foo\\nbar'" => array( "'foo\\\nbar'", 'foobar', '' ), + "'foo\\31 23'" => array( "'foo\\31 23'", 'foo123', '' ), + "'Ü\\sup'" => array( "'Ü\\sup'", 'Üsup', '' ), + "'foo\\31\\n23'" => array( "'foo\\31\n23'", 'foo123', '' ), + "'foo\\31\\t23'" => array( "'foo\\31\t23'", 'foo123', '' ), + "'foo\\00003123'" => array( "'foo\\00003123'", 'foo123', '' ), + + "'foo\\" => array( "'foo\\", 'foo', '' ), + + /* + * Invalid UTF-8 in string context, reachable only via a direct + * parse() call ( from_selectors() scrubs first ): an escaped + * invalid byte decodes its maximal subpart to U+FFFD, exactly as + * in ident context; raw invalid bytes pass through unexamined. 
+ */ + 'string with escaped invalid byte' => array( "'a\\\xC0z'", "a\u{FFFD}z", '' ), + 'string with raw invalid byte' => array( "'a\xC0z'", "a\xC0z", '' ), + + '"' => array( '"', '', '' ), + '"\\"' => array( '"\\"', '"', '' ), + '"missing close' => array( '"missing close', 'missing close', '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: .foo' => array( '.foo' ), + 'Invalid: #foo' => array( '#foo' ), + "Invalid: 'newline\\n'" => array( "'newline\n'" ), + 'Invalid: foo' => array( 'foo' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpCssTypeSelector.php b/tests/phpunit/tests/html-api/wpCssTypeSelector.php new file mode 100644 index 0000000000000..94ae49bff474a --- /dev/null +++ b/tests/phpunit/tests/html-api/wpCssTypeSelector.php @@ -0,0 +1,52 @@ +assertNull( $result ); + } else { + $this->assertSame( $expected, $result->type ); + $this->assertSame( $rest, substr( $input, $offset ) ); + } + } + + /** + * Data provider. + * + * @return array + */ + public static function data_type_selectors(): array { + return array( + 'any *' => array( '* .class', '*', ' .class' ), + 'a' => array( 'a', 'a', '' ), + 'div.class' => array( 'div.class', 'div', '.class' ), + 'custom-type#id' => array( 'custom-type#id', 'custom-type', '#id' ), + 'escape at EOF foo\\' => array( 'foo\\', "foo\u{fffd}", '' ), + + // Invalid + 'Invalid: (empty string)' => array( '' ), + 'Invalid: #id' => array( '#id' ), + 'Invalid: .class' => array( '.class' ), + 'Invalid: [attr]' => array( '[attr]' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php new file mode 100644 index 0000000000000..fcb1acf3fa7d6 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php @@ -0,0 +1,110 @@ +' ); + $this->assertFalse( $processor->select( 'div' ) ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_selectors + */ + public function test_selects_all_matches( string $html, 
string $selector, int $match_count ) { + $processor = WP_HTML_Processor::create_full_parser( $html ); + $count = 0; + while ( $processor->select( $selector ) ) { + $breadcrumb_string = implode( ', ', $processor->get_breadcrumbs() ); + $this->assertTrue( + $processor->get_attribute( 'match' ), + "Matched unexpected tag {$processor->get_tag()} @ {$breadcrumb_string}" + ); + ++$count; + } + $this->assertSame( $match_count, $count, 'Did not match expected number of tags.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_selectors(): array { + return array( + 'any' => array( '

', '*', 5 ), + 'quirks mode ID' => array( '

In quirks mode, ID matching is case-insensitive.', '#id', 2 ), + 'quirks mode class' => array( '

In quirks mode, class matching is case-insensitive.', '.c', 2 ), + 'no-quirks mode ID' => array( '

In no-quirks mode, ID matching is case-sensitive.', '#id', 1 ), + 'no-quirks mode class' => array( '

In no-quirks mode, class matching is case-sensitive.', '.c', 1 ), + 'any descendant' => array( '

', 'section *', 4 ), + 'any child matches all children' => array( '

', 'section > *', 2 ), + + 'multiple complex selectors' => array( '

', 'section > div p > i', 1 ), + + // Per Selectors-4, the substring matchers ^= $= *= match nothing when the value + // is empty. ~= also matches nothing: an empty string is never a list item. + 'empty value ^= matches nothing' => array( '', '[x^=""]', 0 ), + 'empty value $= matches nothing' => array( '', '[x$=""]', 0 ), + 'empty value *= matches nothing' => array( '', '[x*=""]', 0 ), + 'empty value ~= matches nothing' => array( '', '[x~=""]', 0 ), + 'empty value ^= i matches nothing' => array( '', '[x^="" i]', 0 ), + 'empty value = matches empty' => array( '', '[x=""]', 1 ), + 'empty value |= matches empty or hyphen-prefixed' => array( '', '[x|=""]', 2 ), + + /* + * HTML's case-insensitive attribute value list applies to + * "an HTML element in an HTML document": a foreign element with + * the same attribute name keeps case-sensitive matching. + * ( Chromium applies the list to foreign elements as well, + * diverging from the HTML specification here. ) + * + * https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors + */ + 'HTML-namespace-only attribute case-insensitivity' => array( '', '[type=TEXT]', 1 ), + ); + } + + /** + * @ticket 62653 + * + * @expectedIncorrectUsage WP_HTML_Processor::select + * + * @dataProvider data_invalid_selectors + */ + public function test_invalid_selector( string $selector ) { + $processor = WP_HTML_Processor::create_fragment( 'irrelevant' ); + $this->assertFalse( $processor->select( $selector ) ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_invalid_selectors(): array { + return array( + 'invalid selector' => array( '[invalid!selector]' ), + + // The class selectors below are not allowed in non-final position. 
+ 'unsupported child selector' => array( '.parent > .child' ), + 'unsupported descendant selector' => array( '.ancestor .descendant' ), + + // Unsupported combinators + 'unsupported next sibling selector' => array( 'p + p' ), + 'unsupported subsequent sibling selector' => array( 'p ~ p' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php new file mode 100644 index 0000000000000..96bb8e1b4457d --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php @@ -0,0 +1,214 @@ +' ); + $this->assertFalse( $processor->select( 'div' ) ); + } + + /** + * @ticket 62653 + * + * @dataProvider data_selectors + */ + public function test_select( string $html, string $selector, int $match_count ) { + $processor = new WP_HTML_Tag_Processor( $html ); + $count = 0; + while ( $processor->select( $selector ) ) { + $this->assertTrue( + $processor->get_attribute( 'match' ), + "Matched unexpected tag {$processor->get_tag()}" + ); + ++$count; + } + $this->assertSame( $match_count, $count, 'Did not match expected number of tags.' ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_selectors(): array { + return array( + 'simple type' => array( '

', 'div', 2 ), + 'any type' => array( '
', '*', 2 ), + 'simple class' => array( '
', '.x', 2 ), + 'simple id' => array( '
', '#x', 2 ), + + 'attribute presence' => array( '
', '[att]', 2 ), + 'attribute empty string match' => array( '
', '[att=""]', 2 ), + 'attribute value' => array( '

', '[att=val]', 2 ), + 'attribute quoted value' => array( '

', '[att="::"]', 2 ), + 'attribute case insensitive' => array( '

', '[att="VAL"i]', 2 ), + 'attribute case sensitive mod' => array( '

', '[att="val"s]', 2 ), + + 'attribute one of' => array( '

', '[att~="b"]', 3 ), + 'attribute one of insensitive' => array( '

', '[att~="b"i]', 1 ), + 'attribute one of mod sensitive' => array( '
', '[att~="b"s]', 1 ), + 'attribute one of whitespace cases' => array( "
", '[att~="b"]', 1 ), + + 'attribute with-hyphen' => array( '

', '[att|="special"]', 2 ), + 'attribute with-hyphen insensitive' => array( '

', '[att|="special" i]', 2 ), + 'attribute with-hyphen sensitive mod' => array( '

', '[att|="special"s]', 1 ), + + 'attribute prefixed' => array( '

', '[att^="p"]', 2 ), + 'attribute prefixed insensitive' => array( '

', '[att^="p"i]', 1 ), + 'attribute prefixed sensitive mod' => array( '

', '[att^="p"s]', 1 ), + + 'attribute suffixed' => array( '

', '[att$="x"]', 2 ), + 'attribute suffixed insensitive' => array( '

', '[att$="x"i]', 1 ), + 'attribute suffixed sensitive mod' => array( '

', '[att$="x"s]', 1 ), + + 'attribute contains' => array( '

', '[att*="x"]', 2 ), + 'attribute contains insensitive' => array( '

', '[att*="x"i]', 1 ), + 'attribute contains sensitive mod' => array( '

', '[att*="x"s]', 1 ), + + /* + * An escaped trailing whitespace code point is part of the ident, + * not trailing whitespace: `.foo\ ` is the class `foo ` (with a + * space). Class attribute values are whitespace-separated token + * lists, so such a class can never match. It must NOT be confused + * with a backslash at the end of input, which decodes to U+FFFD. + */ + 'escaped space at end' => array( "

", '.foo\\ ', 0 ), + 'escaped tab at end' => array( "
", ".foo\\\t", 0 ), + + /* + * The end of input closes an open attribute selector ( and an + * unterminated string ): tokenization auto-closes simple blocks + * at EOF. + */ + 'EOF-truncated attribute presence' => array( '
', '[att', 1 ), + 'EOF-truncated attribute value' => array( '
', '[att=val', 1 ), + 'EOF-truncated quoted value' => array( '
', '[att="a b', 1 ), + 'EOF-truncated with modifier' => array( '
', '[att=val i', 1 ), + + /* + * HTML defines a set of attributes whose values must match ASCII + * case-insensitively in selectors when no modifier is present. + * An explicit `s` modifier still forces case-sensitive matching. + * Attributes outside the list stay case-sensitive by default. + * + * https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors + */ + 'HTML insensitive attribute =' => array( '', '[type=TEXT]', 2 ), + 'HTML insensitive attribute ~=' => array( '', '[rel~=nofollow]', 1 ), + 'HTML insensitive attribute ^=' => array( '', '[media^=screen]', 1 ), + 'HTML insensitive attribute |=' => array( '', '[hreflang|=en]', 1 ), + 'HTML insensitive attribute s mod' => array( '', '[type=text s]', 1 ), + 'HTML insensitive attribute i mod' => array( '', '[type=text i]', 2 ), + 'unlisted attribute stays sensitive' => array( '', '[data-type=TEXT]', 1 ), + 'listed attribute name is matched case-insensitively in the list' => array( '', '[TYPE=TEXT]', 1 ), + + 'list' => array( '

', 'a, p, .class, #id, [att]', 2 ), + 'compound' => array( '

', 'custom-el[att="bar"][ fruit ~= "banana" i]', 1 ), + ); + } + + /** + * @ticket 62653 + * + * @expectedIncorrectUsage WP_HTML_Tag_Processor::select + * + * @dataProvider data_invalid_selectors + */ + public function test_invalid_selector( string $selector ) { + $processor = new WP_HTML_Tag_Processor( 'irrelevant' ); + $this->assertFalse( $processor->select( $selector ) ); + } + + /** + * Data provider. + * + * @return array + */ + public static function data_invalid_selectors(): array { + return array( + 'complex descendant' => array( 'div *' ), + 'complex child' => array( 'div > *' ), + 'invalid selector' => array( '[invalid!selector]' ), + + /* + * A backslash before a newline at the end of input is not a valid + * escape and is not trailing whitespace: the selector is invalid. + * The CR and FF variants are normalized to a newline before + * tokenizing. + */ + 'escape before newline at end' => array( ".foo\\\n" ), + 'escape before CR at end' => array( ".foo\\\r" ), + 'escape before FF at end' => array( ".foo\\\f" ), + + /* + * EOF auto-closes an open attribute selector block, but + * grammar-level truncation is still invalid. + */ + 'truncated matcher without value' => array( '[a=' ), + 'truncated half matcher' => array( '[a~' ), + 'lone open bracket' => array( '[' ), + ); + } + + /** + * Selector strings are UTF-8 text: invalid byte sequences are replaced + * with U+FFFD per maximal subpart before parsing. A selector containing + * invalid bytes therefore matches a literal U+FFFD in the document, and + * an identity escape of an invalid byte is equivalent to the same byte + * unescaped — both are scrubbed before tokenization. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_select_scrubbed_selector_matches_replacement_character() { + $html = "
"; + + $processor = new WP_HTML_Tag_Processor( $html ); + $this->assertTrue( + $processor->select( ".a\xC0b" ), + 'Scrubbed selector should match the replacement character in the document.' + ); + + $processor = new WP_HTML_Tag_Processor( $html ); + $this->assertTrue( + $processor->select( ".a\\\xC0b" ), + 'An identity escape of an invalid byte should be equivalent to the unescaped byte.' + ); + } + + /** + * A selector containing invalid bytes can never match those same raw + * bytes in a document: the selector side is scrubbed to U+FFFD while + * the Tag Processor reports raw document bytes untouched. + * + * This pins a deliberate, documented divergence. If the HTML API value + * getters (get_attribute(), class_list(), …) are ever changed to scrub + * invalid UTF-8 in their return values, both sides become U+FFFD and + * this case flips to a match — update this expectation in the same + * change. + * + * The selector byte (0xC1) is unique within this file on purpose: + * select() memoizes the most recently parsed selector string, so the + * scrub notice only fires when this test's selector was not already + * parsed by an earlier test. A unique selector string guarantees a + * fresh parse regardless of test order. + * + * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors + */ + public function test_select_scrubbed_selector_does_not_match_raw_invalid_document_bytes() { + $processor = new WP_HTML_Tag_Processor( "
" ); + $this->assertFalse( + $processor->select( ".a\xC1b" ), + 'Scrubbed selector should not match raw invalid bytes in the document.' + ); + } +}