.
+ *
+ * @link https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
+ *
+ * @param string $input The input string.
+ * @param int $offset The byte offset in the string.
+ * @return bool True if the next codepoint is an ident start code point, otherwise false.
+ */
+ final protected static function is_ident_start_codepoint( string $input, int $offset ): bool {
+ 	// No bounds check is performed here; the byte is read once and reused.
+ 	$char = $input[ $offset ];
+
+ 	// U+005F LOW LINE (_) and every byte above the ASCII range qualify.
+ 	if ( '_' === $char || ord( $char ) > 0x7F ) {
+ 		return true;
+ 	}
+
+ 	// What remains is the ASCII letter check.
+ 	return ( 'a' <= $char && $char <= 'z' ) || ( 'A' <= $char && $char <= 'Z' );
+ }
+
+ /**
+  * Determines whether the byte at the given offset belongs to an "ident code point."
+  *
+  * Caution! No bounds checking happens here. Callers must only pass an offset
+  * that lies inside the string.
+  *
+  * > ident code point
+  * >   An ident-start code point, a digit, or U+002D HYPHEN-MINUS (-).
+  * > digit
+  * >   A code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9) inclusive.
+  *
+  * @link https://www.w3.org/TR/css-syntax-3/#ident-code-point
+  *
+  * @param string $input  The input string.
+  * @param int    $offset The byte offset in the string.
+  * @return bool True if the next codepoint is an ident code point, otherwise false.
+  */
+ final protected static function is_ident_codepoint( string $input, int $offset ): bool {
+ 	$char = $input[ $offset ];
+
+ 	// U+002D HYPHEN-MINUS (-).
+ 	if ( '-' === $char ) {
+ 		return true;
+ 	}
+
+ 	// Digits U+0030 through U+0039.
+ 	if ( '0' <= $char && $char <= '9' ) {
+ 		return true;
+ 	}
+
+ 	// Everything else is decided by the ident-start rules.
+ 	return self::is_ident_start_codepoint( $input, $offset );
+ }
+
+ /**
+  * Reports whether the code points at the given offset would start an ident sequence.
+  *
+  * Follows "4.3.9. Check if three code points would start an ident sequence"
+  * from CSS Syntax Level 3. No input is consumed. The decision depends on the
+  * first code point:
+  *
+  *  - U+002D HYPHEN-MINUS: true when the second code point is an ident-start
+  *    code point or another U+002D HYPHEN-MINUS, or when the second and third
+  *    code points are a valid escape; false otherwise.
+  *  - ident-start code point: true.
+  *  - U+005C REVERSE SOLIDUS (\): true when the first and second code points
+  *    are a valid escape; false otherwise.
+  *  - anything else: false.
+  *
+  * An offset at or beyond the end of the input yields false.
+  *
+  * @link https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
+  *
+  * @param string $input  The input string.
+  * @param int    $offset The byte offset in the string.
+  * @return bool True if the next three codepoints would start an ident sequence, otherwise false.
+  */
+ final protected static function check_if_three_code_points_would_start_an_ident_sequence( string $input, int $offset ): bool {
+ 	$length = strlen( $input );
+ 	if ( $offset >= $length ) {
+ 		return false;
+ 	}
+
+ 	$first = $input[ $offset ];
+
+ 	// > U+005C REVERSE SOLIDUS (\)
+ 	if ( '\\' === $first ) {
+ 		return self::next_two_are_valid_escape( $input, $offset );
+ 	}
+
+ 	/*
+ 	 * > ident-start code point
+ 	 * >   Return true.
+ 	 * > anything else
+ 	 * >   Return false.
+ 	 */
+ 	if ( '-' !== $first ) {
+ 		return self::is_ident_start_codepoint( $input, $offset );
+ 	}
+
+ 	// > U+002D HYPHEN-MINUS
+ 	$second_offset = $offset + 1;
+ 	if ( $second_offset >= $length ) {
+ 		return false;
+ 	}
+
+ 	/*
+ 	 * The second code point decides: another hyphen-minus, the start of a
+ 	 * valid escape, or an ident-start code point. Anything else is false.
+ 	 */
+ 	return (
+ 		'-' === $input[ $second_offset ] ||
+ 		self::next_two_are_valid_escape( $input, $second_offset ) ||
+ 		self::is_ident_start_codepoint( $input, $second_offset )
+ 	);
+ }
+
+ /**
+  * Prepares a selector string for parsing.
+  *
+  * The input is scrubbed to valid UTF-8 ( ill-formed sequences become U+FFFD ),
+  * stripped of leading whitespace, and then filtered according to the CSS
+  * input-preprocessing rules.
+  *
+  * @see https://www.w3.org/TR/css-syntax-3/#input-byte-stream
+  * @see https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+  *
+  * @param string $input The selector string.
+  * @return string The normalized selector string.
+  */
+ final protected static function normalize_selector_input( string $input ): string {
+ 	/*
+ 	 * Selector strings are treated as UTF-8 text. Scrubbing swaps ill-formed
+ 	 * byte sequences for U+FFFD REPLACEMENT CHARACTER. A selector changed this
+ 	 * way is unlikely to match what the developer intended, so the change
+ 	 * is reported as incorrect usage, attributed to the called class.
+ 	 *
+ 	 * https://www.w3.org/TR/css-syntax-3/#input-byte-stream
+ 	 */
+ 	$valid_utf8 = wp_scrub_utf8( $input );
+ 	if ( $input !== $valid_utf8 ) {
+ 		_doing_it_wrong(
+ 			static::class . '::from_selectors',
+ 			'Selector strings must be valid UTF-8: ill-formed byte sequences were replaced with U+FFFD (�), which is unlikely to match the intended elements.',
+ 			'{WP_VERSION}'
+ 		);
+ 	}
+
+ 	/*
+ 	 * Only leading whitespace ( including \f ) is stripped. Trailing whitespace
+ 	 * can be significant: `.foo\ ` is the class `foo ` because the backslash
+ 	 * escapes the final space, a backslash before a final newline is an
+ 	 * invalid escape, and a backslash at the very end of input is a valid
+ 	 * escape decoding to U+FFFD. Insignificant trailing whitespace is left
+ 	 * for the selector grammar to consume.
+ 	 */
+ 	$without_leading_whitespace = ltrim( $valid_utf8, " \t\r\n\f" );
+
+ 	/*
+ 	 * Input preprocessing:
+ 	 *
+ 	 *  - CR LF pairs, lone CR, and FF each become a single LF. The search
+ 	 *    list is applied in order, so CR LF pairs are collapsed before any
+ 	 *    lone CR is handled.
+ 	 *  - U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER.
+ 	 *
+ 	 * https://www.w3.org/TR/css-syntax-3/#input-preprocessing
+ 	 */
+ 	$with_line_feeds = str_replace( array( "\r\n", "\r", "\f" ), "\n", $without_leading_whitespace );
+
+ 	return str_replace( "\0", "\u{FFFD}", $with_line_feeds );
+ }
+}
diff --git a/src/wp-includes/html-api/css/class-wp-css-type-selector.php b/src/wp-includes/html-api/css/class-wp-css-type-selector.php
new file mode 100644
index 0000000000000..c7c7baa2d5508
--- /dev/null
+++ b/src/wp-includes/html-api/css/class-wp-css-type-selector.php
@@ -0,0 +1,90 @@
+type = $type;
+ }
+
+ /**
+  * Determines if the processor's current position matches the selector.
+  *
+  * When the processor reports no tag ( `get_tag()` returns null ) there is
+  * nothing to compare and the result is false.
+  *
+  * @param WP_HTML_Tag_Processor $processor The processor.
+  * @return bool True if the processor's current position matches the selector.
+  */
+ public function matches( WP_HTML_Tag_Processor $processor ): bool {
+ 	$current_tag = $processor->get_tag();
+
+ 	return null !== $current_tag && $this->matches_tag( $current_tag );
+ }
+
+ /**
+  * Checks whether the selector matches the provided tag name.
+  *
+  * The universal selector `*` matches every tag name. Any other type is
+  * compared against the tag name with `strcasecmp()`, ignoring case.
+  *
+  * @param string $tag_name The tag name to test.
+  * @return bool True if the selector matches the tag name, otherwise false.
+  */
+ public function matches_tag( string $tag_name ): bool {
+ 	return '*' === $this->type || 0 === strcasecmp( $tag_name, $this->type );
+ }
+
+ /**
+  * Parses a selector string to create a selector instance.
+  *
+  * A `*` at the offset produces the universal selector. Otherwise an ident is
+  * parsed at the offset and used as the type.
+  *
+  * To create an instance of this class, use the {@see WP_CSS_Compound_Selector_List::from_selectors()} method.
+  *
+  * @param string $input  The selector string.
+  * @param int    $offset The offset into the string. The offset is passed by reference and
+  *                       will be updated if the parse is successful.
+  * @return static|null The selector instance, or null if the parse was unsuccessful.
+  */
+ public static function parse( string $input, int &$offset ) {
+ 	if ( $offset >= strlen( $input ) ) {
+ 		return null;
+ 	}
+
+ 	// Universal selector: consume the single `*` byte.
+ 	if ( '*' === $input[ $offset ] ) {
+ 		++$offset;
+ 		// Instantiate through `self`, the same way as the ident branch below.
+ 		return new self( '*' );
+ 	}
+
+ 	$type = self::parse_ident( $input, $offset );
+
+ 	return null === $type ? null : new self( $type );
+ }
+}
diff --git a/src/wp-settings.php b/src/wp-settings.php
index ef5c7784ee561..e9ff3af23a096 100644
--- a/src/wp-settings.php
+++ b/src/wp-settings.php
@@ -278,6 +278,15 @@
require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php';
require ABSPATH . WPINC . '/class-wp-block-processor.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-selector-parser-matcher.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-attribute-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-class-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-id-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-type-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-compound-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-complex-selector.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-compound-selector-list.php';
+require ABSPATH . WPINC . '/html-api/css/class-wp-css-complex-selector-list.php';
require ABSPATH . WPINC . '/class-wp-http.php';
require ABSPATH . WPINC . '/class-wp-http-streams.php';
require ABSPATH . WPINC . '/class-wp-http-curl.php';
diff --git a/tests/phpunit/tests/html-api/wpCssAttributeSelector.php b/tests/phpunit/tests/html-api/wpCssAttributeSelector.php
new file mode 100644
index 0000000000000..99051f2cc971c
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssAttributeSelector.php
@@ -0,0 +1,123 @@
+assertNull( $result );
+ } else {
+ $this->assertNotNull( $result, "Failed to parse attribute selector: {$input}" );
+ $this->assertSame( $expected_name, $result->name );
+ $this->assertSame( $expected_matcher, $result->matcher );
+ $this->assertSame( $expected_value, $result->value );
+ $this->assertSame( $expected_modifier, $result->modifier );
+ $this->assertSame( $rest, substr( $input, $offset ) );
+ }
+ }
+
+ /**
+ * Data provider.
+ *
+ * Every dataset starts with the selector input. Datasets that are expected
+ * to parse continue with the expected attribute name, matcher, value,
+ * modifier, and the input remaining after the parsed selector. Datasets
+ * labelled "Invalid" supply only the input.
+ *
+ * NOTE(review): the column order is read from the rows; the consuming test's
+ * signature is not visible here — confirm against the test method.
+ *
+ * @return array
+ */
+ public static function data_attribute_selectors(): array {
+ return array(
+ '[href]' => array( '[href]', 'href', null, null, null, '' ),
+ '[href] type' => array( '[href] type', 'href', null, null, null, ' type' ),
+ '[href]#id' => array( '[href]#id', 'href', null, null, null, '#id' ),
+ '[href].class' => array( '[href].class', 'href', null, null, null, '.class' ),
+ '[href][href2]' => array( '[href][href2]', 'href', null, null, null, '[href2]' ),
+ '[\n href\t\r]' => array( "[\n href\t\r]", 'href', null, null, null, '' ),
+ '[href=foo]' => array( '[href=foo]', 'href', WP_CSS_Attribute_Selector::MATCH_EXACT, 'foo', null, '' ),
+ '[a=b]' => array( '[a=b]', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ),
+ '[href \n = bar ]' => array( "[href \n = bar ]", 'href', WP_CSS_Attribute_Selector::MATCH_EXACT, 'bar', null, '' ),
+ '[href \n ^= baz ]' => array( "[href \n ^= baz ]", 'href', WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, 'baz', null, '' ),
+
+ // Case-sensitivity modifiers, in either letter case.
+ '[match $= insensitive i]' => array( '[match $= insensitive i]', 'match', WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, 'insensitive', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ),
+ '[match|=sensitive s]' => array( '[match|=sensitive s]', 'match', WP_CSS_Attribute_Selector::MATCH_EXACT_OR_HYPHEN_SUFFIXED, 'sensitive', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ),
+ '[att=val I]' => array( '[att=val I]', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ),
+ '[att=val S]' => array( '[att=val S]', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ),
+
+ // Quoted string values.
+ '[match~="quoted[][]"]' => array( '[match~="quoted[][]"]', 'match', WP_CSS_Attribute_Selector::MATCH_ONE_OF_EXACT, 'quoted[][]', null, '' ),
+ "[match$='quoted!{}']" => array( "[match$='quoted!{}']", 'match', WP_CSS_Attribute_Selector::MATCH_SUFFIXED_BY, 'quoted!{}', null, '' ),
+ "[match*='quoted's]" => array( "[match*='quoted's]", 'match', WP_CSS_Attribute_Selector::MATCH_CONTAINS, 'quoted', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ),
+
+ // Escapes inside quoted string values.
+ '[escape-nl="foo\\nbar"]' => array( "[escape-nl='foo\\\nbar']", 'escape-nl', WP_CSS_Attribute_Selector::MATCH_EXACT, 'foobar', null, '' ),
+ '[escape-seq="\\31 23"]' => array( "[escape-seq='\\31 23']", 'escape-seq', WP_CSS_Attribute_Selector::MATCH_EXACT, '123', null, '' ),
+
+ /*
+ * The end of input closes an open attribute selector: tokenization
+ * auto-closes unterminated simple blocks (and strings) at EOF.
+ *
+ * https://www.w3.org/TR/css-syntax-3/#consume-simple-block
+ */
+ 'EOF [foo' => array( '[foo', 'foo', null, null, null, '' ),
+ 'EOF [ \n foo' => array( "[ \n foo", 'foo', null, null, null, '' ),
+ 'EOF [foo ' => array( '[foo ', 'foo', null, null, null, '' ),
+ 'EOF [a=b' => array( '[a=b', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ),
+ 'EOF [att=val ' => array( '[att=val ', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', null, '' ),
+ 'EOF [a="b' => array( '[a="b', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ),
+ "EOF [a='b" => array( "[a='b", 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ),
+ 'EOF [a="b\\' => array( '[a="b\\', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, 'b', null, '' ),
+ 'EOF [a=b\\' => array( '[a=b\\', 'a', WP_CSS_Attribute_Selector::MATCH_EXACT, "b\u{FFFD}", null, '' ),
+ 'EOF [a^=b' => array( '[a^=b', 'a', WP_CSS_Attribute_Selector::MATCH_PREFIXED_BY, 'b', null, '' ),
+ 'EOF [att=val i' => array( '[att=val i', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ),
+ 'EOF [att=val i ' => array( '[att=val i ', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_INSENSITIVE, '' ),
+ 'EOF [att="val"s' => array( '[att="val"s', 'att', WP_CSS_Attribute_Selector::MATCH_EXACT, 'val', WP_CSS_Attribute_Selector::MODIFIER_CASE_SENSITIVE, '' ),
+
+ // Invalid
+ 'Invalid: (empty string)' => array( '' ),
+ 'Invalid: foo' => array( 'foo' ),
+ 'Invalid: [' => array( '[' ),
+ 'Invalid: [ ' => array( '[ ' ),
+ 'Invalid: [a=' => array( '[a=' ),
+ 'Invalid: [a= ' => array( '[a= ' ),
+ 'Invalid: [a~' => array( '[a~' ),
+ 'Invalid: [a=b x' => array( '[a=b x' ),
+ 'Invalid: [a i' => array( '[a i' ),
+ 'Invalid: [#foo]' => array( '[#foo]' ),
+ 'Invalid: [*|*]' => array( '[*|*]' ),
+ 'Invalid: [ns|*]' => array( '[ns|*]' ),
+ 'Invalid: [* |att]' => array( '[* |att]' ),
+ 'Invalid: [*| att]' => array( '[*| att]' ),
+ 'Invalid: [att * =]' => array( '[att * =]' ),
+ 'Invalid: [att+=val]' => array( '[att+=val]' ),
+ 'Invalid: [a=]' => array( '[a=]' ),
+ 'Invalid: [a~=]' => array( '[a~=]' ),
+ 'Invalid: [a==b]' => array( '[a==b]' ),
+ 'Invalid: [a=1]' => array( '[a=1]' ),
+ 'Invalid: [a=1' => array( '[a=1' ),
+ 'Invalid: [att i]' => array( '[att i]' ),
+ 'Invalid: [att s]' => array( '[att s]' ),
+ "Invalid: [att='val\\n']" => array( "[att='val\n']" ),
+ "Invalid: [att='val\\n" => array( "[att='val\n" ),
+ 'Invalid: [att="val"ix' => array( '[att="val"ix' ),
+ 'Invalid: [att="val"ix ' => array( '[att="val"ix ' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssClassSelector.php b/tests/phpunit/tests/html-api/wpCssClassSelector.php
new file mode 100644
index 0000000000000..3328b047fa143
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssClassSelector.php
@@ -0,0 +1,50 @@
+assertNull( $result );
+ } else {
+ $this->assertSame( $expected, $result->class_name );
+ $this->assertSame( $rest, substr( $input, $offset ) );
+ }
+ }
+
+ /**
+ * Data provider.
+ *
+ * Datasets expected to parse hold the selector input, the expected class
+ * name, and the input remaining after the parsed selector. Datasets that
+ * hold only the input are the ones labelled "not class" / "not valid".
+ *
+ * NOTE(review): the consuming test's signature is not visible here — confirm
+ * the column order against the test method.
+ *
+ * @return array
+ */
+ public static function data_class_selectors(): array {
+ return array(
+ 'valid ._-foo123' => array( '._-foo123', '_-foo123', '' ),
+ 'valid .foo.bar' => array( '.foo.bar', 'foo', '.bar' ),
+ 'escaped .\31 23' => array( '.\\31 23', '123', '' ),
+ 'with descendant .\31 23 div' => array( '.\\31 23 div', '123', ' div' ),
+ 'escape at EOF .foo\\' => array( '.foo\\', "foo\u{fffd}", '' ),
+
+ // Input-only datasets.
+ 'not class foo' => array( 'foo' ),
+ 'not class #bar' => array( '#bar' ),
+ 'not valid .1foo' => array( '.1foo' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssComplexSelector.php b/tests/phpunit/tests/html-api/wpCssComplexSelector.php
new file mode 100644
index 0000000000000..8738bb6fc32d2
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssComplexSelector.php
@@ -0,0 +1,71 @@
+ .child#bar[baz=quux] , rest';
+ $offset = 0;
+
+ /** @var WP_CSS_Complex_Selector|null */
+ $sel = WP_CSS_Complex_Selector::parse( $input, $offset );
+
+ $this->assertSame( 2, count( $sel->context_selectors ) );
+
+ // Relative selectors should be reverse ordered.
+ $this->assertSame( 'el2', $sel->context_selectors[0][0]->type );
+ $this->assertSame( WP_CSS_Complex_Selector::COMBINATOR_CHILD, $sel->context_selectors[0][1] );
+
+ $this->assertSame( 'el1', $sel->context_selectors[1][0]->type );
+ $this->assertSame( WP_CSS_Complex_Selector::COMBINATOR_DESCENDANT, $sel->context_selectors[1][1] );
+
+ $this->assertSame( 3, count( $sel->self_selector->subclass_selectors ) );
+ $this->assertNull( $sel->self_selector->type_selector );
+ $this->assertSame( 'child', $sel->self_selector->subclass_selectors[0]->class_name );
+
+ $this->assertSame( ', rest', substr( $input, $offset ) );
+ }
+
+ /**
+  * Parsing yields null when a child combinator is followed by a comma
+  * instead of another compound selector.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_invalid_complex_selector() {
+ 	$selector = 'el.foo#bar[baz=quux] > , rest';
+ 	$position = 0;
+
+ 	$this->assertNull( WP_CSS_Complex_Selector::parse( $selector, $position ) );
+ }
+
+ /**
+  * Parsing yields null when the compound selector before a combinator
+  * carries class, ID, and attribute selectors.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_invalid_complex_selector_nonfinal_subclass() {
+ 	$selector = 'el.foo#bar[baz=quux] > final, rest';
+ 	$position = 0;
+
+ 	$this->assertNull( WP_CSS_Complex_Selector::parse( $selector, $position ) );
+ }
+
+ /**
+  * Parsing an empty string yields null.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_empty_complex_selector() {
+ 	$selector = '';
+ 	$position = 0;
+
+ 	$this->assertNull( WP_CSS_Complex_Selector::parse( $selector, $position ) );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php b/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php
new file mode 100644
index 0000000000000..b85f788f98f0d
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssComplexSelectorList.php
@@ -0,0 +1,65 @@
+ selector';
+ $result = WP_CSS_Complex_Selector_List::from_selectors( $input );
+ $this->assertNotNull( $result );
+ }
+
+ /**
+  * A selector list ending in consecutive commas is rejected.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_invalid_selector_list() {
+ 	$this->assertNull( WP_CSS_Complex_Selector_List::from_selectors( 'el,,' ) );
+ }
+
+ /**
+  * A selector followed by a stray `!` is rejected.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_invalid_selector_list2() {
+ 	$this->assertNull( WP_CSS_Complex_Selector_List::from_selectors( 'el!' ) );
+ }
+
+ /**
+  * Input made up solely of whitespace is rejected.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_empty_selector_list() {
+ 	$whitespace_only = " \t \t\n\r\f";
+
+ 	$this->assertNull( WP_CSS_Complex_Selector_List::from_selectors( $whitespace_only ) );
+ }
+
+ /**
+  * The invalid-UTF-8 scrub notice is attributed to the called class. When
+  * reached through this class it must name
+  * WP_CSS_Complex_Selector_List::from_selectors rather than the
+  * WP_CSS_Compound_Selector_List parent, which is where from_selectors()
+  * and the scrub are implemented. The fuzzer's notice model relies on the
+  * per-class name.
+  *
+  * @expectedIncorrectUsage WP_CSS_Complex_Selector_List::from_selectors
+  */
+ public function test_invalid_utf8_scrub_notice_reports_the_called_class() {
+ 	$selector_list = WP_CSS_Complex_Selector_List::from_selectors( "el \xC2.child" );
+
+ 	$this->assertNotNull( $selector_list, 'Selector with invalid UTF-8 should parse after scrubbing.' );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssCompoundSelector.php b/tests/phpunit/tests/html-api/wpCssCompoundSelector.php
new file mode 100644
index 0000000000000..8092ee049b6e1
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssCompoundSelector.php
@@ -0,0 +1,44 @@
+ .child';
+ $offset = 0;
+ $sel = WP_CSS_Compound_Selector::parse( $input, $offset );
+
+ $this->assertSame( 'el', $sel->type_selector->type );
+ $this->assertSame( 3, count( $sel->subclass_selectors ) );
+ $this->assertSame( 'foo', $sel->subclass_selectors[0]->class_name, 'foo' );
+ $this->assertSame( 'bar', $sel->subclass_selectors[1]->id, 'bar' );
+ $this->assertSame( 'baz', $sel->subclass_selectors[2]->name, 'baz' );
+ $this->assertSame( WP_CSS_Attribute_Selector::MATCH_EXACT, $sel->subclass_selectors[2]->matcher );
+ $this->assertSame( 'quux', $sel->subclass_selectors[2]->value );
+ $this->assertSame( ' > .child', substr( $input, $offset ) );
+ }
+
+ /**
+  * Parsing an empty string yields null and leaves the offset untouched.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_empty_selector() {
+ 	$selector = '';
+ 	$position = 0;
+
+ 	$this->assertNull( WP_CSS_Compound_Selector::parse( $selector, $position ) );
+ 	$this->assertSame( 0, $position );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php b/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php
new file mode 100644
index 0000000000000..33149c22ed400
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssCompoundSelectorList.php
@@ -0,0 +1,143 @@
+assertNotNull( $result );
+ }
+
+ /**
+  * A selector list ending in consecutive commas is rejected.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_invalid_selector_list() {
+ 	$this->assertNull( WP_CSS_Compound_Selector_List::from_selectors( 'el,,' ) );
+ }
+
+ /**
+  * A selector followed by a stray `!` is rejected.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_invalid_selector_list2() {
+ 	$this->assertNull( WP_CSS_Compound_Selector_List::from_selectors( 'el!' ) );
+ }
+
+ /**
+  * Whitespace escaped by a backslash at the end of input is part of the
+  * ident, so input normalization must keep it: `.foo\ ` is the valid class
+  * `foo ` (with a space), not a backslash at the end of input.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_escaped_whitespace_at_end_of_input() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( '.foo\\ ' );
+
+ 	$this->assertNotNull( $selector_list );
+ }
+
+ /**
+  * A backslash followed by a newline is not a valid escape. At the end of
+  * input the newline must not be treated as trimmable trailing whitespace.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_escape_before_newline_at_end_of_input_is_invalid() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( ".foo\\\n" );
+
+ 	$this->assertNull( $selector_list );
+ }
+
+ /**
+  * Input made up solely of whitespace is rejected.
+  *
+  * @ticket 62653
+  */
+ public function test_parse_empty_selector_list() {
+ 	$whitespace_only = " \t \t\n\r\f";
+
+ 	$this->assertNull( WP_CSS_Compound_Selector_List::from_selectors( $whitespace_only ) );
+ }
+
+ /**
+  * Two types separated by whitespace are rejected by the compound
+  * selector list.
+  *
+  * @ticket 62653
+  */
+ public function test_unsupported_complex_selector() {
+ 	$this->assertNull( WP_CSS_Compound_Selector_List::from_selectors( 'ancestor descendant' ) );
+ }
+
+ /**
+  * Selector strings are UTF-8 text. Before parsing, invalid byte sequences
+  * are replaced with U+FFFD per maximal subpart (CSS Syntax §3.2 via the
+  * WHATWG Encoding Standard), so the selector parses instead of being
+  * rejected. Since the replacement is almost certainly not what the
+  * developer meant, `_doing_it_wrong()` is triggered as well.
+  *
+  * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors
+  */
+ public function test_invalid_utf8_is_scrubbed_to_replacement_character_and_notifies() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( ".B\xFCcher" );
+
+ 	$this->assertNotNull( $selector_list, 'Selector with invalid UTF-8 should parse after scrubbing.' );
+ }
+
+ /**
+  * Scrubbing is the identity function on valid input, so valid UTF-8 —
+  * even when it contains a literal U+FFFD — must parse without raising an
+  * incorrect-usage notice.
+  */
+ public function test_valid_utf8_with_literal_replacement_character_is_not_notified() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( ".B\u{FFFD}cher" );
+
+ 	$this->assertNotNull( $selector_list, 'Selector containing a literal U+FFFD should parse.' );
+ }
+
+ /**
+  * Scrubbing applies uniformly to the whole input: a list in which only
+  * one of several selectors contains invalid bytes still parses as a list.
+  *
+  * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors
+  */
+ public function test_invalid_utf8_in_selector_list_is_scrubbed() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( ".ok, .B\xE2\x8Ccher" );
+
+ 	$this->assertNotNull( $selector_list, 'Selector list with invalid UTF-8 should parse after scrubbing.' );
+ }
+
+ /**
+  * A selector that is nothing but a single invalid byte parses. The byte is
+  * scrubbed to U+FFFD, an ident-start code point and therefore a valid type
+  * selector. It is surprising, yet it follows from scrubbing before
+  * tokenization — the parser never sees the invalid byte.
+  *
+  * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors
+  */
+ public function test_lone_invalid_byte_parses_as_replacement_character_type_selector() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( "\x80" );
+
+ 	$this->assertNotNull( $selector_list, 'A lone invalid byte should parse as a U+FFFD type selector.' );
+ }
+
+ /**
+  * The notice describes the byte replacement, which takes place before
+  * parsing. It therefore fires even if the grammar goes on to reject the
+  * scrubbed selector.
+  *
+  * @expectedIncorrectUsage WP_CSS_Compound_Selector_List::from_selectors
+  */
+ public function test_invalid_utf8_notice_fires_even_when_selector_is_rejected() {
+ 	$selector_list = WP_CSS_Compound_Selector_List::from_selectors( "\x80 div" );
+
+ 	$this->assertNull( $selector_list, 'Descendant combinators are unsupported by the compound list; the scrubbed selector should still be rejected.' );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssIdSelector.php b/tests/phpunit/tests/html-api/wpCssIdSelector.php
new file mode 100644
index 0000000000000..03694fa4456e5
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssIdSelector.php
@@ -0,0 +1,51 @@
+assertNull( $result );
+ } else {
+ $this->assertSame( $expected, $result->id );
+ $this->assertSame( $rest, substr( $input, $offset ) );
+ }
+ }
+
+ /**
+ * Data provider.
+ *
+ * Datasets expected to parse hold the selector input, the expected ID, and
+ * the input remaining after the parsed selector. Datasets under "Invalid"
+ * hold only the input.
+ *
+ * NOTE(review): the consuming test's signature is not visible here — confirm
+ * the column order against the test method.
+ *
+ * @return array
+ */
+ public static function data_id_selectors(): array {
+ return array(
+ 'valid #_-foo123' => array( '#_-foo123', '_-foo123', '' ),
+ 'valid #foo#bar' => array( '#foo#bar', 'foo', '#bar' ),
+ 'escaped #\31 23' => array( '#\\31 23', '123', '' ),
+ 'with descendant #\31 23 div' => array( '#\\31 23 div', '123', ' div' ),
+ 'escape at EOF #foo\\' => array( '#foo\\', "foo\u{fffd}", '' ),
+
+ // Invalid
+ 'not ID foo' => array( 'foo' ),
+ 'not ID .bar' => array( '.bar' ),
+ 'not valid #1foo' => array( '#1foo' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php b/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php
new file mode 100644
index 0000000000000..181519b3cbed3
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssSelectorParserMatcher.php
@@ -0,0 +1,281 @@
+original_substitute_character = mb_substitute_character();
+ mb_substitute_character( 0x2603 );
+ $this->test_class = new class() extends WP_CSS_Selector_Parser_Matcher {
+ public function matches( $processor ): bool {
+ throw new Error( 'Matches called on test class.' );
+ }
+ public static function parse( string $input, int &$offset ) {
+ throw new Error( 'Parse called on test class.' );
+ }
+
+ /*
+ * Parsing
+ */
+ public static function test_parse_ident( string $input, int &$offset ) {
+ return self::parse_ident( $input, $offset );
+ }
+
+ public static function test_parse_string( string $input, int &$offset ) {
+ return self::parse_string( $input, $offset );
+ }
+
+ /*
+ * Utilities
+ */
+ public static function test_is_ident_codepoint( string $input, int $offset ) {
+ return self::is_ident_codepoint( $input, $offset );
+ }
+
+ public static function test_is_ident_start_codepoint( string $input, int $offset ) {
+ return self::is_ident_start_codepoint( $input, $offset );
+ }
+ };
+ }
+
+ /**
+ * Restores the mbstring substitute character held in
+ * `$this->original_substitute_character`, then runs the parent tear-down.
+ */
+ public function tear_down(): void {
+ mb_substitute_character( $this->original_substitute_character );
+ parent::tear_down();
+ }
+
+ /**
+ * Data provider for ident parsing.
+ *
+ * Every dataset starts with the input string. Datasets that are expected to
+ * parse also carry the expected decoded ident and the unconsumed remainder
+ * of the input. Datasets listed under "Invalid" carry only the input, in
+ * which case test_parse_ident() asserts a null result.
+ *
+ * @return array
+ */
+ public static function data_idents(): array {
+ return array(
+ 'trailing #' => array( '_-foo123#xyz', '_-foo123', '#xyz' ),
+ 'trailing .' => array( '😍foo123.xyz', '😍foo123', '.xyz' ),
+ 'trailing " "' => array( '😍foo123 more', '😍foo123', ' more' ),
+ 'escaped ASCII character' => array( '\\xyz', 'xyz', '' ),
+ 'escape after multibyte character' => array( 'Ü\\sup', 'Üsup', '' ),
+ 'escape after multibyte characters' => array( 'ÜÜ\\sup', 'ÜÜsup', '' ),
+ 'hex escape after multibyte character' => array( 'Ü\\31 23', 'Ü123', '' ),
+ 'escaped space' => array( '\\ x', ' x', '' ),
+ 'escaped emoji' => array( '\\😍', '😍', '' ),
+ 'hex unicode codepoint' => array( '\\1f0a1', '🂡', '' ),
+ 'HEX UNICODE CODEPOINT' => array( '\\1D4B2', '𝒲', '' ),
+
+ 'hex tab-suffixed 1' => array( "\\31\t23", '123', '' ),
+ 'hex newline-suffixed 1' => array( "\\31\n23", '123', '' ),
+ 'hex space-suffixed 1' => array( "\\31 23", '123', '' ),
+ 'hex tab' => array( '\\9', "\t", '' ),
+ 'hex a' => array( '\\61 bc', 'abc', '' ),
+ 'hex a max escape length' => array( '\\000061bc', 'abc', '' ),
+
+ 'out of range replacement min' => array( '\\110000 ', "\u{fffd}", '' ),
+ 'out of range replacement max' => array( '\\ffffff ', "\u{fffd}", '' ),
+ 'leading surrogate min replacement' => array( '\\d800 ', "\u{fffd}", '' ),
+ 'leading surrogate max replacement' => array( '\\dbff ', "\u{fffd}", '' ),
+ 'trailing surrogate min replacement' => array( '\\dc00 ', "\u{fffd}", '' ),
+ 'trailing surrogate max replacement' => array( '\\dfff ', "\u{fffd}", '' ),
+ 'can start with -ident' => array( '-ident', '-ident', '' ),
+ 'can start with --anything' => array( '--anything', '--anything', '' ),
+ // Dataset name matches the input it exercises (two hyphens, then an underscore).
+ 'can start with --_anything' => array( '--_anything', '--_anything', '' ),
+ 'can start with --1anything' => array( '--1anything', '--1anything', '' ),
+ 'can start with -\31 23' => array( '-\31 23', '-123', '' ),
+ 'can start with --\31 23' => array( '--\31 23', '--123', '' ),
+ 'ident ends before ]' => array( 'ident]', 'ident', ']' ),
+
+ /*
+ * > EOF
+ * > This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
+ *
+ * https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
+ */
+ 'escape at EOF' => array( 'foo\\', "foo\u{fffd}", '' ),
+ 'lone escape at EOF' => array( '\\', "\u{fffd}", '' ),
+ 'hyphen then escape at EOF' => array( '-\\', "-\u{fffd}", '' ),
+
+ // Identity escapes of multibyte characters, by UTF-8 sequence length.
+ 'escaped 2-byte character' => array( "\\\u{FC}z", "\u{FC}z", '' ),
+ 'escaped 3-byte character' => array( "\\\u{270F}z", "\u{270F}z", '' ),
+ 'escaped 4-byte character' => array( "\\\u{1F0A1}z", "\u{1F0A1}z", '' ),
+ 'escaped 2-byte character at EOF' => array( "a\\\u{FC}", "a\u{FC}", '' ),
+ 'escaped 3-byte character at EOF' => array( "a\\\u{270F}", "a\u{270F}", '' ),
+ 'escaped 4-byte character at EOF' => array( "a\\\u{1F0A1}", "a\u{1F0A1}", '' ),
+
+ /*
+ * An escaped NUL byte passes through this low-level helper unchanged.
+ * This is unreachable through the public selector API, where
+ * normalize_selector_input() replaces NUL with U+FFFD before parsing.
+ */
+ 'escaped NUL byte' => array( "a\\\x00z", "a\x00z", '' ),
+
+ /*
+ * Identity escapes of invalid UTF-8 byte sequences.
+ *
+ * These inputs are not valid UTF-8, which can only reach the parser
+ * through a direct `parse()` call: the public `from_selectors()` API
+ * replaces invalid byte sequences with U+FFFD before parsing. On
+ * this un-normalized path the escape decodes the maximal subpart of
+ * the invalid sequence (CSS Syntax §3.2 via the WHATWG Encoding
+ * Standard) to a single U+FFFD — independent of the
+ * `mb_substitute_character()` setting, which set_up() pins to ☃
+ * precisely to prove that independence. Invalid bytes *after* the
+ * escaped subpart are not escaped; they pass through this low-level
+ * helper raw, exactly as unescaped invalid bytes do (the 0xAF,
+ * 0xA0 0x80, and 0x90 0x80 0x80 tails below).
+ */
+ 'escaped lone continuation byte' => array( "a\\\x80z", "a\u{FFFD}z", '' ),
+ 'escaped overlong lead 0xC0' => array( "a\\\xC0\xAFz", "a\u{FFFD}\xAFz", '' ),
+ 'escaped invalid lead 0xF5' => array( "a\\\xF5z", "a\u{FFFD}z", '' ),
+ 'escaped truncated 3-byte sequence' => array( "a\\\xE2\x80z", "a\u{FFFD}z", '' ),
+ 'escaped truncated 4-byte at EOF' => array( "a\\\xF0\x9F\x82", "a\u{FFFD}", '' ),
+ 'escaped UTF-8-encoded surrogate' => array( "a\\\xED\xA0\x80z", "a\u{FFFD}\xA0\x80z", '' ),
+ 'escaped sequence above U+10FFFF' => array( "a\\\xF4\x90\x80\x80z", "a\u{FFFD}\x90\x80\x80z", '' ),
+
+ // Invalid
+ 'Invalid: (empty string)' => array( '' ),
+ 'Invalid: bad start >' => array( '>ident' ),
+ 'Invalid: bad start [' => array( '[ident' ),
+ 'Invalid: bad start #' => array( '#ident' ),
+ 'Invalid: bad start " "' => array( ' ident' ),
+ 'Invalid: bad start 1' => array( '1ident' ),
+ 'Invalid: bad start -1' => array( '-1ident' ),
+ 'Invalid: bad start -' => array( '-' ),
+ );
+ }
+
+ /**
+  * Square brackets are neither ident code points nor ident-start code points.
+  *
+  * @ticket 62653
+  */
+ public function test_is_ident_and_is_ident_start() {
+ 	$helper   = $this->test_class;
+ 	$brackets = array( '[', ']' );
+
+ 	foreach ( $brackets as $bracket ) {
+ 		$this->assertFalse( $helper::test_is_ident_codepoint( $bracket, 0 ) );
+ 	}
+
+ 	foreach ( $brackets as $bracket ) {
+ 		$this->assertFalse( $helper::test_is_ident_start_codepoint( $bracket, 0 ) );
+ 	}
+ }
+
+ /**
+  * Parses an ident from the start of the input and checks both the decoded
+  * value and how much of the input was consumed. When no expected value is
+  * supplied, parsing must yield null.
+  *
+  * @ticket 62653
+  *
+  * @dataProvider data_idents
+  */
+ public function test_parse_ident( string $input, ?string $expected = null, ?string $rest = null ) {
+ 	$position = 0;
+ 	$ident    = $this->test_class::test_parse_ident( $input, $position );
+
+ 	if ( null === $expected ) {
+ 		$this->assertNull( $ident );
+ 		return;
+ 	}
+
+ 	$this->assertSame( $expected, $ident, 'Ident did not match.' );
+
+ 	$remainder = substr( $input, $position );
+ 	$this->assertSame( $rest, $remainder, 'Offset was not updated correctly.' );
+ }
+
+ /**
+  * Pins the exact offset reached after decoding an escaped invalid byte.
+  *
+  * Comparing `substr( $input, $offset )` against the expected remainder, as
+  * test_parse_ident() does, gives '' both when the offset sits at the end of
+  * the input and when it has run past it, so that check cannot detect an
+  * overrun. This test compares the offset with the input length directly:
+  * the escape must consume only the 1-byte maximal subpart, and after the
+  * trailing `z` the offset must sit at the end of the input and no further.
+  * (An earlier `mb_substr()`-based decode advanced by the byte length of the
+  * substitute character and, under the ☃ canary, overran the end by one byte.)
+  */
+ public function test_parse_ident_escaped_invalid_byte_does_not_overrun_offset() {
+ 	$selector = "a\\\x80z";
+ 	$position = 0;
+ 	$ident    = $this->test_class::test_parse_ident( $selector, $position );
+
+ 	$this->assertSame( "a\u{FFFD}z", $ident, 'Ident did not match.' );
+
+ 	$input_length = strlen( $selector );
+ 	$this->assertSame( $input_length, $position, 'Offset should stop exactly at the end of input.' );
+ }
+
+ /**
+  * Parses a quoted string from the start of the input and checks both the
+  * decoded value and how much of the input was consumed. When no expected
+  * value is supplied, parsing must yield null.
+  *
+  * @ticket 62653
+  *
+  * @dataProvider data_strings
+  */
+ public function test_parse_string( string $input, ?string $expected = null, ?string $rest = null ) {
+ 	$position = 0;
+ 	$parsed   = $this->test_class::test_parse_string( $input, $position );
+
+ 	if ( null === $expected ) {
+ 		$this->assertNull( $parsed );
+ 		return;
+ 	}
+
+ 	$this->assertSame( $expected, $parsed, 'String did not match.' );
+
+ 	$remainder = substr( $input, $position );
+ 	$this->assertSame( $rest, $remainder, 'Offset was not updated correctly.' );
+ }
+
+ /**
+ * Data provider for string parsing.
+ *
+ * Every dataset starts with the input. Datasets that are expected to parse
+ * also carry the expected decoded string (without its quotes) and the
+ * unconsumed remainder of the input. Datasets listed under "Invalid" carry
+ * only the input, in which case test_parse_string() asserts a null result.
+ *
+ * @return array
+ */
+ public static function data_strings(): array {
+ return array(
+ '"foo"' => array( '"foo"', 'foo', '' ),
+ '"foo"after' => array( '"foo"after', 'foo', 'after' ),
+ '"foo""two"' => array( '"foo""two"', 'foo', '"two"' ),
+ '"foo"\'two\'' => array( '"foo"\'two\'', 'foo', "'two'" ),
+
+ "'foo'" => array( "'foo'", 'foo', '' ),
+ "'foo'after" => array( "'foo'after", 'foo', 'after' ),
+ "'foo'\"two\"" => array( "'foo'\"two\"", 'foo', '"two"' ),
+ "'foo''two'" => array( "'foo''two'", 'foo', "'two'" ),
+
+ // A backslash followed by a real newline contributes nothing to the value.
+ "'foo\\nbar'" => array( "'foo\\\nbar'", 'foobar', '' ),
+ "'foo\\31 23'" => array( "'foo\\31 23'", 'foo123', '' ),
+ "'Ü\\sup'" => array( "'Ü\\sup'", 'Üsup', '' ),
+ "'foo\\31\\n23'" => array( "'foo\\31\n23'", 'foo123', '' ),
+ "'foo\\31\\t23'" => array( "'foo\\31\t23'", 'foo123', '' ),
+ "'foo\\00003123'" => array( "'foo\\00003123'", 'foo123', '' ),
+
+ // Unterminated string ending in a backslash: the backslash adds nothing.
+ "'foo\\" => array( "'foo\\", 'foo', '' ),
+
+ /*
+ * Invalid UTF-8 in string context, reachable only via a direct
+ * parse() call ( from_selectors() scrubs first ): an escaped
+ * invalid byte decodes its maximal subpart to U+FFFD, exactly as
+ * in ident context; raw invalid bytes pass through unexamined.
+ */
+ 'string with escaped invalid byte' => array( "'a\\\xC0z'", "a\u{FFFD}z", '' ),
+ 'string with raw invalid byte' => array( "'a\xC0z'", "a\xC0z", '' ),
+
+ // Strings left open at the end of input still yield the text read so far.
+ '"' => array( '"', '', '' ),
+ '"\\"' => array( '"\\"', '"', '' ),
+ '"missing close' => array( '"missing close', 'missing close', '' ),
+
+ // Invalid
+ 'Invalid: (empty string)' => array( '' ),
+ 'Invalid: .foo' => array( '.foo' ),
+ 'Invalid: #foo' => array( '#foo' ),
+ "Invalid: 'newline\\n'" => array( "'newline\n'" ),
+ 'Invalid: foo' => array( 'foo' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpCssTypeSelector.php b/tests/phpunit/tests/html-api/wpCssTypeSelector.php
new file mode 100644
index 0000000000000..94ae49bff474a
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpCssTypeSelector.php
@@ -0,0 +1,52 @@
+assertNull( $result );
+ } else {
+ $this->assertSame( $expected, $result->type );
+ $this->assertSame( $rest, substr( $input, $offset ) );
+ }
+ }
+
+ /**
+ * Data provider for type selector parsing.
+ *
+ * Every dataset starts with the selector input. Datasets that are expected
+ * to parse also carry the expected type and the unconsumed remainder of the
+ * input, which the consuming test compares against `$result->type` and
+ * `substr( $input, $offset )`. Datasets listed under "Invalid" carry only
+ * the input and are expected to produce a null result.
+ *
+ * @return array
+ */
+ public static function data_type_selectors(): array {
+ return array(
+ 'any *' => array( '* .class', '*', ' .class' ),
+ 'a' => array( 'a', 'a', '' ),
+ 'div.class' => array( 'div.class', 'div', '.class' ),
+ 'custom-type#id' => array( 'custom-type#id', 'custom-type', '#id' ),
+ // A backslash as the final byte is expected to decode to U+FFFD.
+ 'escape at EOF foo\\' => array( 'foo\\', "foo\u{fffd}", '' ),
+
+ // Invalid
+ 'Invalid: (empty string)' => array( '' ),
+ 'Invalid: #id' => array( '#id' ),
+ 'Invalid: .class' => array( '.class' ),
+ 'Invalid: [attr]' => array( '[attr]' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php b/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php
new file mode 100644
index 0000000000000..fcb1acf3fa7d6
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlProcessor-select.php
@@ -0,0 +1,110 @@
+' );
+ $this->assertFalse( $processor->select( 'div' ) );
+ }
+
+ /**
+  * Repeatedly calls select() on a full-parser processor and verifies that
+  * every tag it stops on carries the `match` attribute, and that the total
+  * number of stops equals the expected count.
+  *
+  * @ticket 62653
+  *
+  * @dataProvider data_selectors
+  */
+ public function test_selects_all_matches( string $html, string $selector, int $match_count ) {
+ 	$processor = WP_HTML_Processor::create_full_parser( $html );
+
+ 	for ( $found = 0; $processor->select( $selector ); ++$found ) {
+ 		$path = implode( ', ', $processor->get_breadcrumbs() );
+ 		$this->assertTrue(
+ 			$processor->get_attribute( 'match' ),
+ 			'Matched unexpected tag ' . $processor->get_tag() . ' @ ' . $path
+ 		);
+ 	}
+
+ 	$this->assertSame( $match_count, $found, 'Did not match expected number of tags.' );
+ }
+
+ /**
+ * Data provider for test_selects_all_matches().
+ *
+ * Each dataset is an HTML document, a selector, and the number of tags the
+ * selector is expected to match. The consuming test requires every matched
+ * tag to carry a `match` attribute.
+ *
+ * @return array
+ */
+ public static function data_selectors(): array {
+ return array(
+ 'any' => array( '', '*', 5 ),
+ 'quirks mode ID' => array( '
In quirks mode, ID matching is case-insensitive.', '#id', 2 ),
+ 'quirks mode class' => array( '
In quirks mode, class matching is case-insensitive.', '.c', 2 ),
+ 'no-quirks mode ID' => array( '
In no-quirks mode, ID matching is case-sensitive.', '#id', 1 ),
+ 'no-quirks mode class' => array( '
In no-quirks mode, class matching is case-sensitive.', '.c', 1 ),
+ 'any descendant' => array( '', 'section *', 4 ),
+ 'any child matches all children' => array( '', 'section > *', 2 ),
+
+ 'multiple complex selectors' => array( '', 'section > div p > i', 1 ),
+
+ // Per Selectors-4, the substring matchers ^= $= *= match nothing when the value
+ // is empty. ~= also matches nothing: an empty string is never a list item.
+ 'empty value ^= matches nothing' => array( '', '[x^=""]', 0 ),
+ 'empty value $= matches nothing' => array( '', '[x$=""]', 0 ),
+ 'empty value *= matches nothing' => array( '', '[x*=""]', 0 ),
+ 'empty value ~= matches nothing' => array( '', '[x~=""]', 0 ),
+ 'empty value ^= i matches nothing' => array( '', '[x^="" i]', 0 ),
+ 'empty value = matches empty' => array( '', '[x=""]', 1 ),
+ 'empty value |= matches empty or hyphen-prefixed' => array( '', '[x|=""]', 2 ),
+
+ /*
+ * HTML's case-insensitive attribute value list applies to
+ * "an HTML element in an HTML document": a foreign element with
+ * the same attribute name keeps case-sensitive matching.
+ * ( Chromium applies the list to foreign elements as well,
+ * diverging from the HTML specification here. )
+ *
+ * https://html.spec.whatwg.org/multipage/semantics-other.html#case-sensitivity-of-selectors
+ */
+ 'HTML-namespace-only attribute case-insensitivity' => array( '', '[type=TEXT]', 1 ),
+ );
+ }
+
+ /**
+  * An invalid or unsupported selector makes select() return false; the
+  * incorrect-usage notice named below is expected for every dataset.
+  *
+  * @ticket 62653
+  *
+  * @expectedIncorrectUsage WP_HTML_Processor::select
+  *
+  * @dataProvider data_invalid_selectors
+  */
+ public function test_invalid_selector( string $selector ) {
+ 	$fragment_processor = WP_HTML_Processor::create_fragment( 'irrelevant' );
+ 	$selected           = $fragment_processor->select( $selector );
+
+ 	$this->assertFalse( $selected );
+ }
+
+ /**
+ * Data provider of selectors that select() is expected to refuse.
+ *
+ * Each dataset is a single selector string, covering a syntactically
+ * invalid selector as well as selector forms labelled unsupported below.
+ *
+ * @return array
+ */
+ public static function data_invalid_selectors(): array {
+ return array(
+ 'invalid selector' => array( '[invalid!selector]' ),
+
+ // The class selectors below are not allowed in non-final position.
+ 'unsupported child selector' => array( '.parent > .child' ),
+ 'unsupported descendant selector' => array( '.ancestor .descendant' ),
+
+ // Unsupported combinators
+ 'unsupported next sibling selector' => array( 'p + p' ),
+ 'unsupported subsequent sibling selector' => array( 'p ~ p' ),
+ );
+ }
+}
diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php
new file mode 100644
index 0000000000000..96bb8e1b4457d
--- /dev/null
+++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-select.php
@@ -0,0 +1,214 @@
+' );
+ $this->assertFalse( $processor->select( 'div' ) );
+ }
+
+ /**
+  * Repeatedly calls select() on a tag processor and verifies that every tag
+  * it stops on carries the `match` attribute, and that the total number of
+  * stops equals the expected count.
+  *
+  * @ticket 62653
+  *
+  * @dataProvider data_selectors
+  */
+ public function test_select( string $html, string $selector, int $match_count ) {
+ 	$processor = new WP_HTML_Tag_Processor( $html );
+
+ 	for ( $found = 0; $processor->select( $selector ); ++$found ) {
+ 		$this->assertTrue(
+ 			$processor->get_attribute( 'match' ),
+ 			'Matched unexpected tag ' . $processor->get_tag()
+ 		);
+ 	}
+
+ 	$this->assertSame( $match_count, $found, 'Did not match expected number of tags.' );
+ }
+
+ /**
+ * Data provider.
+ *
+ * @return array
+ */
+ public static function data_selectors(): array {
+ return array(
+ 'simple type' => array( '', 'div', 2 ),
+ 'any type' => array( '', '*', 2 ),
+ 'simple class' => array( '', '.x', 2 ),
+ 'simple id' => array( '', '#x', 2 ),
+
+ 'attribute presence' => array( '', '[att]', 2 ),
+ 'attribute empty string match' => array( '', '[att=""]', 2 ),
+ 'attribute value' => array( '