From 673e2d248520ed18c412618985940679ed17fb13 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 13:35:40 -0700 Subject: [PATCH 01/25] HTML API: Handle parsing changes in foreign content. --- .../wpHtmlTagProcessor-token-scanning.php | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php index 4f1e1dab249d1..debfdd6541e03 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php @@ -435,6 +435,39 @@ public function test_basic_assertion_abruptly_closed_cdata_section() { ); } + /** + * Ensures that basic CDATA sections inside foreign content are detected. + * + * @ticket {TICKET_NUMBER} + */ + public function test_basic_cdata_in_foreign_content() { + $processor = new WP_HTML_Tag_Processor( 'this is >&gt; real CDATA' ); + $processor->next_token(); + $processor->next_token(); + + $this->assertSame( + '#cdata-section', + $processor->get_token_name(), + "Should have found a CDATA section but found {$processor->get_token_name()} instead." + ); + + $this->assertNull( + $processor->get_tag(), + 'Should not have been able to query tag name on non-element token.' + ); + + $this->assertNull( + $processor->get_attribute( 'type' ), + 'Should not have been able to query attributes on non-element token.' + ); + + $this->assertSame( + 'this is >> real CDATA', + $processor->get_modifiable_text(), + 'Found incorrect modifiable text.' + ); + } + /** * Ensures that normative Processing Instruction nodes are properly parsed. * From 0451bc2ec56d65ebbf64d12f2f3ab9a3147eba5b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 14:53:09 -0700 Subject: [PATCH 02/25] Implement via external settings. 
--- .../html-api/class-wp-html-tag-processor.php | 41 ++++++++++++++++++- .../wpHtmlTagProcessor-token-scanning.php | 7 ++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 1b4db41bcee12..edcc7be810d16 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -515,6 +515,21 @@ class WP_HTML_Tag_Processor { */ protected $parser_state = self::STATE_READY; + /** + * Indicates whether the parser is inside foreign content, + * e.g. inside an SVG or MathML element. + * + * Several parsing rules change based on whether the parser + * is inside foreign content, including whether CDATA sections + * are allowed and whether a self-closing flag indicates that + * an element has no content. + * + * @since 6.5.0 + * + * @var bool + */ + protected $is_inside_foreign_content = false; + /** * What kind of syntax token became an HTML comment. * @@ -944,7 +959,6 @@ public function next_token() { $duplicate_attributes = $this->duplicate_attributes; // Find the closing tag if necessary. 
- $found_closer = false; switch ( $tag_name ) { case 'SCRIPT': $found_closer = $this->skip_script_data(); @@ -1719,6 +1733,31 @@ private function parse_next_tag() { return true; } + if ( + $this->is_inside_foreign_content && + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 1 ); + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + + $this->parser_state = self::STATE_CDATA_NODE; + $this->text_starts_at = $at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + return true; + } + /* * Anything else here is an incorrectly-opened comment and transitions * to the bogus comment state - skip to the nearest >. If no closer is diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php index debfdd6541e03..3945c06a9f262 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php @@ -443,6 +443,13 @@ public function test_basic_assertion_abruptly_closed_cdata_section() { public function test_basic_cdata_in_foreign_content() { $processor = new WP_HTML_Tag_Processor( 'this is >&gt; real CDATA' ); $processor->next_token(); + + // Artificially set flag; this should be done in the HTML Processor. 
+ $reflector = new ReflectionClass( $processor ); + $is_inside_foreign_content = $reflector->getProperty( 'is_inside_foreign_content' ); + $is_inside_foreign_content->setAccessible( true ); + $is_inside_foreign_content->setValue( $processor, true ); + $processor->next_token(); $this->assertSame( From 68446bb3d997faa4aca4d0240773107f8bdeefff Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 21:44:33 -0700 Subject: [PATCH 03/25] Support scanning all tokens in the HTML Processor. --- .../html-api/class-wp-html-processor.php | 73 +++++++++++++------ 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 6b0879cde892f..17fd880c1a83c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -361,6 +361,10 @@ public function get_last_error() { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -384,6 +388,10 @@ public function next_tag( $query = null ) { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -430,9 +438,7 @@ public function next_tag( $query = null ) { public function next_token() { $found_a_token = parent::next_token(); - if ( '#tag' === $this->get_token_type() ) { - $this->step( self::PROCESS_CURRENT_NODE ); - } + $this->step( self::PROCESS_CURRENT_NODE ); return $found_a_token; } @@ -529,25 +535,32 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * is provided in the opening tag, otherwise it expects a tag closer. 
*/ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + if ( + $top_node && + ( + self::is_void( $top_node->node_name ) || + $top_node->node_name[0] < 'A' || + $top_node->node_name[0] > 'Z' + ) + ) { $this->state->stack_of_open_elements->pop(); } } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { - continue; - } + parent::next_token(); } - // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state || + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), - $this->get_tag(), + $this->get_token_name(), $this->is_tag_closer(), $this->release_internal_bookmark_on_destruct ); @@ -591,10 +604,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -619,11 +628,37 @@ public function get_breadcrumbs() { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? '-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? 
'-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; switch ( $op ) { + case '#comment': + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + // Ignore DOCTYPE declarations. + return $this->step(); + + case '#text': + $text_at = $this->bookmarks[ $this->state->current_token->bookmark_name ]->start; + $text_length = $this->bookmarks[ $this->state->current_token->bookmark_name ]->length; + + if ( 1 === $text_length && "\x00" === $this->html[ $text_at ] ) { + // Ignore this token. + return $this->step(); + } + + if ( strspn( $this->html, " \t\n\f\r", $text_at, $text_length ) !== $text_length ) { + $this->state->frameset_ok = false; + } + + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + return true; + /* * > A start tag whose tag name is "button" */ @@ -1151,10 +1186,6 @@ private function step_in_body() { * @return string|false Name of created bookmark, or false if unable to create. */ private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); From f2ea4df3b7901a49e6a164fa3134544259dbf1ed Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 22:04:40 -0700 Subject: [PATCH 04/25] Update test which needs to now skip over text nodes. 
--- tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php index 7362a588cf3f4..546581ec3f4bc 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php @@ -132,6 +132,7 @@ public function test_in_body_skips_unexpected_button_closer() { * When encountering the BUTTON closing tag, there is no BUTTON in the stack of open elements. * It should be ignored as there's no BUTTON to close. */ + $processor->step(); $this->assertTrue( $processor->step(), 'Found no further tags when it should have found the closing DIV' ); $this->assertSame( 'DIV', $processor->get_tag(), "Did not skip unexpected BUTTON; stopped at {$processor->get_tag()}." ); $this->assertTrue( $processor->is_tag_closer(), 'Did not find that the terminal DIV tag is a closer.' ); From cfe67bacfb9645bc6c84e18d9c5e0725261b0482 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 23:32:49 -0700 Subject: [PATCH 05/25] WIP: Support elements in foreign content. 
--- .../html-api/class-wp-html-processor.php | 323 +++++++++++++++++- .../html-api/class-wp-html-token.php | 18 + .../html-api/wpHtmlProcessorBreadcrumbs.php | 2 - 3 files changed, 340 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 17fd880c1a83c..85198565709c0 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -538,6 +538,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( $top_node && ( + ( 'html' !== $top_node->namespace && $top_node->has_self_closing_flag ) || self::is_void( $top_node->node_name ) || $top_node->node_name[0] < 'A' || $top_node->node_name[0] > 'Z' @@ -558,14 +559,53 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { return false; } + $token_name = $this->get_token_name(); + $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), - $this->get_token_name(), + $token_name, $this->is_tag_closer(), $this->release_internal_bookmark_on_destruct ); + // @todo This is the context node in a fragment but it seems to be the same in practice. + $adjusted_current_node = $this->state->stack_of_open_elements->current_node(); + try { + // @see https://html.spec.whatwg.org/#tree-construction-dispatcher + $parse_in_foreign_content = ! ( + ! $adjusted_current_node || + 'html' === $adjusted_current_node->namespace || + ( + 'mathml' === $adjusted_current_node->integration_node_type && + ! $this->is_tag_closer() && + 'MGLYPH' !== $token_name && + 'MALIGNMARK' !== $token_name + ) || + ( + 'mathml' === $adjusted_current_node->integration_node_type && + '#text' === $token_name + ) || + ( + 'mathml' === $adjusted_current_node->integration_node_type && + 'ANNOTATION-XML' === $adjusted_current_node->node_name && + ! 
$this->is_tag_closer() && + 'SVG' === $token_name + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + ! $this->is_tag_closer() + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + '#text' === $token_name + ) + ); + + if ( $parse_in_foreign_content ) { + return $this->step_in_foreign_content(); + } + switch ( $this->state->insertion_mode ) { case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: return $this->step_in_body(); @@ -1060,6 +1100,24 @@ private function step_in_body() { case '+TRACK': $this->insert_html_element( $this->state->current_token ); return true; + + /* + * > A start tag whose tag name is "math" + */ + case '+MATH': + $this->reconstruct_active_formatting_elements(); + $this->state->current_token->namespace = 'math'; + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "svg" + */ + case '+SVG': + $this->reconstruct_active_formatting_elements(); + $this->state->current_token->namespace = 'svg'; + $this->insert_html_element( $this->state->current_token ); + return true; } /* @@ -1172,6 +1230,193 @@ private function step_in_body() { } } + /** + * Parses next element in foreign content. + * + * This internal function performs the 'in foreign content' logic + * for the generalized WP_HTML_Processor::step() function. This is + * not a distinct insertion mode. + * + * @since 6.5.0 + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see WP_HTML_Processor::step + * @see https://html.spec.whatwg.org/#parsing-main-inbody + * + * @return bool Whether an element was found. + */ + private function step_in_foreign_content() { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? 
'-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + /* + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + */ + if ( + '+FONT' === $op && + ( + null !== $this->get_attribute( 'color' ) || + null !== $this->get_attribute( 'face' ) || + null !== $this->get_attribute( 'size' ) + ) + ) { + // @todo Indicate a parse error once it's possible. + while ( null !== ( $current_node = $this->state->stack_of_open_elements->current_node() ) ) { + if ( null === $current_node->integration_node_type || 'html' === $current_node->namespace ) { + break; + } + + $this->state->stack_of_open_elements->pop(); + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + switch ( $op ) { + case '#comment': + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + // Ignore DOCTYPE declarations. + return $this->step(); + + case '#text': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "b", "big", "blockquote", "body", "br", "center", + * > "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", + * > "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", + * > "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", + * > "table", "tt", "u", "ul", "var" + * + * > An end tag whose tag name is "br", "p" + */ + case '+B': + case '+BIG': + case '+BLOCKQUOTE': + case '+BODY': + case '+BR': + case '+CENTER': + case '+CODE': + case '+DD': + case '+DIV': + case '+DL': + case '+DT': + case '+EM': + case '+EMBED': + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + case '+HEAD': + case '+HR': + case '+I': + case '+IMG': + case '+LI': + case '+LISTING': + case '+MENU': + case '+META': + case '+NOBR': + case '+OL': + case '+P': + case '+PRE': + case '+RUBY': + case '+S': + case '+SMALL': + case '+SPAN': + case '+STRONG': + case 
'+STRIKE': + case '+SUB': + case '+SUP': + case '+TABLE': + case '+TT': + case '+U': + case '+UL': + case '+VAR': + case '-BR': + case '-P': + // @todo Indicate a parse error once it's possible. + while ( null !== ( $current_node = $this->state->stack_of_open_elements->current_node() ) ) { + if ( null === $current_node->integration_node_type || 'html' === $current_node->namespace ) { + break; + } + + $this->state->stack_of_open_elements->pop(); + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /* + * > Any other start tag + */ + if ( ! $this->is_tag_closer() ) { + // @todo Adjust foreign attributes; this probably should be done in get_attribute(). + + // @todo This is the adjusted current node, will it ever not be this? + $current_node = $this->state->stack_of_open_elements->current_node(); + $this->state->current_token = $current_node->namespace; + + // @todo There should be a false `onlyAddToElementStack` parameter that does stuff. + $this->insert_html_element( $this->state->current_token ); + return true; + } + + /* + * > An end tag whose name is "script", if the current node is an SVG script element. + */ + // @todo Does this rule matter here? + + /* + * > Any other end tag + */ + if ( $this->is_tag_closer() ) { + $node = $this->state->stack_of_open_elements->current_node(); + if ( $tag_name !== $node->node_name ) { + // @todo Indicate a parse error once it's possible. + } + in_foreign_content_end_tag_loop: + if ( $node === $this->state->stack_of_open_elements->stack[0] ) { + // @todo should this return $this->step() instead? + return true; + } + + /* + * > If node's tag name, converted to ASCII lowercase, is the same as the tag name + * > of the token, pop elements from the stack of open elements until node has + * > been popped from the stack, and then return. 
+ */ + if ( $node->node_name === $tag_name ) { + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + // @todo should this return $this->step() instead? + return true; + } + } + } + + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + + if ( 'html' !== $node->namespace ) { + goto in_foreign_content_end_tag_loop; + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + } + /* * Internal helpers */ @@ -1642,6 +1887,82 @@ private function insert_html_element( $token ) { * HTML Specification Helpers */ + /** + * Indicates if the current token is a MathML integration point. + * + * @since 6.5.0 + * + * @see https://html.spec.whatwg.org/#mathml-text-integration-point + * + * @return bool Whether the current token is a MathML integration point. + */ + private function is_mathml_integration_point() { + $token = $this->state->current_token; + + if ( 'mathml' !== $token->namespace || 'M' !== $token->node_name[0] ) { + return false; + } + + $tag_name = $token->node_name; + + return ( + 'MI' === $tag_name || + 'MO' === $tag_name || + 'MN' === $tag_name || + 'MS' === $tag_name || + 'MTEXT' === $tag_name + ); + } + + /** + * Indicates if the current token is an HTML integration point. + * + * Note that this method must be an instance method with access + * to the current token, since it needs to examine the attributes + * of the currently-matched tag, if it's in the MathML namespace. + * Otherwise it would be required to scan the HTML and ensure that + * no other accounting is overlooked. + * + * @since 6.5.0 + * + * @see https://html.spec.whatwg.org/#html-integration-point + * + * @return bool Whether the current token is an HTML integration point. 
+ */ + private function is_html_integration_point() { + $token = $this->state->current_token; + + if ( 'html' === $token->namespace ) { + return false; + } + + $tag_name = $token->node_name; + + if ( 'svg' === $token->namespace ) { + return ( + 'DESC' === $tag_name || + 'FOREIGNOBJECT' === $tag_name || + 'TITLE' === $tag_name + ); + } + + if ( 'mathml' === $token->namespace ) { + if ( 'ANNOTATION-XML' !== $tag_name ) { + return false; + } + + $encoding = $this->get_attribute( 'encoding' ); + + return ( + is_string( $encoding ) && + ( + 0 === strcasecmp( $encoding, 'application/xhtml+xml' ) || + 0 === strcasecmp( $encoding, 'text/html' ) + ) + ); + } + } + /** * Returns whether an element of a given name is in the HTML special category. * diff --git a/src/wp-includes/html-api/class-wp-html-token.php b/src/wp-includes/html-api/class-wp-html-token.php index 86dd7658cfcee..ca0e4ed37f2bd 100644 --- a/src/wp-includes/html-api/class-wp-html-token.php +++ b/src/wp-includes/html-api/class-wp-html-token.php @@ -60,6 +60,24 @@ class WP_HTML_Token { */ public $has_self_closing_flag = false; + /** + * Indicates if the element is an HTML element or if it's inside foreign content. + * + * @since 6.5.0 + * + * @var string 'html', 'svg', or 'math'. + */ + public $namespace = 'html'; + + /** + * Indicates which kind of integration point the element is, if any. + * + * @since 6.5.0 + * + * @var string|null 'mathml', 'html', or null if not an integration point. + */ + public $integration_node_type = null; + /** * Called when token is garbage-collected or otherwise destroyed. 
* diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index 1488be91654a7..dabf04c18e5d7 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -182,7 +182,6 @@ public static function data_unsupported_elements() { 'IFRAME', 'LINK', 'MARQUEE', // Deprecated. - 'MATH', 'META', 'NOBR', // Neutralized. 'NOEMBED', // Neutralized. @@ -199,7 +198,6 @@ public static function data_unsupported_elements() { 'SCRIPT', 'SELECT', 'STYLE', - 'SVG', 'TABLE', 'TBODY', 'TD', From 749e9a30a9e03fe31ba7c4b350ca73cdc4c8344a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 23:54:07 -0700 Subject: [PATCH 06/25] fixup! WIP: Support elements in foreign content. --- .../html-api/class-wp-html-processor.php | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 85198565709c0..14715065ef727 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1344,12 +1344,10 @@ private function step_in_foreign_content() { case '-BR': case '-P': // @todo Indicate a parse error once it's possible. 
- while ( null !== ( $current_node = $this->state->stack_of_open_elements->current_node() ) ) { - if ( null === $current_node->integration_node_type || 'html' === $current_node->namespace ) { - break; - } - + $current_node = $this->state->stack_of_open_elements->current_node(); + while ( $current_node && null === $current_node->integration_node_type && 'html' !== $current_node->namespace ) { $this->state->stack_of_open_elements->pop(); + $current_node = $this->state->stack_of_open_elements->current_node(); } return $this->step( self::REPROCESS_CURRENT_NODE ); @@ -1362,8 +1360,30 @@ private function step_in_foreign_content() { // @todo Adjust foreign attributes; this probably should be done in get_attribute(). // @todo This is the adjusted current node, will it ever not be this? - $current_node = $this->state->stack_of_open_elements->current_node(); - $this->state->current_token = $current_node->namespace; + $current_node = $this->state->stack_of_open_elements->current_node(); + $this->state->current_token->namespace = $current_node->namespace; + + if ( + 'svg' === $current_node->namespace && + ( 'DESC' === $tag_name || 'FOREIGNOBJECT' === $tag_name || 'TITLE' === $tag_name ) + ) { + $this->state->current_token->integration_node_type = 'html'; + } elseif ( + 'mathml' === $current_node->namespace && + ( 'MI' === $tag_name || 'MO' === $tag_name || 'MN' === $tag_name || 'MS' === $tag_name || 'MTEXT' === $tag_name ) + ) { + $this->state->current_token->integration_node_type = 'mathml'; + } elseif ( 'mathml' === $current_node->namespace && 'ANNOTATION_XML' === $tag_name ) { + $encoding = $this->get_attribute( 'encoding' ); + + if ( is_string( $encoding ) ) { + $encoding = strtolower( $encoding ); + + if ( 'text/html' === $encoding || 'application/xhtml+xml' === $encoding ) { + $this->state->current_token->integration_node_type = 'html'; + } + } + } // @todo There should be a false `onlyAddToElementStack` parameter that does stuff. 
$this->insert_html_element( $this->state->current_token ); @@ -1413,7 +1433,7 @@ private function step_in_foreign_content() { goto in_foreign_content_end_tag_loop; } - return $this->step( self::REPROCESS_CURRENT_NODE ); + return $this->step( self::PROCESS_CURRENT_NODE ); } } From 32bbc011e3cea7d2d41020ca071f5ca3172f1dcb Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 2 Feb 2024 00:20:20 -0700 Subject: [PATCH 07/25] Fix: store self-closing flag, not end-tag flag. --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 14715065ef727..28ea379200d74 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -564,7 +564,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), $token_name, - $this->is_tag_closer(), + $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); From 4c9b373f159f9d8ff72a078d4baaf448bbdf9ffd Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 2 Feb 2024 00:25:44 -0700 Subject: [PATCH 08/25] Update labels, collapse next_token() to step(). --- .../html-api/class-wp-html-processor.php | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 28ea379200d74..7e71fd2c8ec72 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -422,25 +422,14 @@ public function next_tag( $query = null ) { } /** - * Ensures internal accounting is maintained for HTML semantic rules while - * the underlying Tag Processor class is seeking to a bookmark. 
+ * Advances to the next token in the document. * - * This doesn't currently have a way to represent non-tags and doesn't process - * semantic rules for text nodes. For access to the raw tokens consider using - * WP_HTML_Tag_Processor instead. - * - * @since 6.5.0 Added for internal support; do not use. - * - * @access private + * @since 6.5.0 * - * @return bool + * @return bool Whether a token was found. */ public function next_token() { - $found_a_token = parent::next_token(); - - $this->step( self::PROCESS_CURRENT_NODE ); - - return $found_a_token; + return $this->step(); } /** @@ -501,7 +490,7 @@ public function matches_breadcrumbs( $breadcrumbs ) { } /** - * Steps through the HTML document and stop at the next tag, if any. + * Steps through the HTML document and stop at the next token, if any. * * @since 6.4.0 * @@ -511,7 +500,7 @@ public function matches_breadcrumbs( $breadcrumbs ) { * @see self::REPROCESS_CURRENT_NODE * * @param string $node_to_process Whether to parse the next node or reprocess the current node. - * @return bool Whether a tag was matched. + * @return bool Whether a token was matched. */ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Refuse to proceed if there was a previous error. From 3f33f68e751e3d6f922f112ae4debcfe88fb8963 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 13:35:40 -0700 Subject: [PATCH 09/25] HTML API: Handle parsing changes in foreign content. 
--- .../wpHtmlTagProcessor-token-scanning.php | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php index 4f1e1dab249d1..debfdd6541e03 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php @@ -435,6 +435,39 @@ public function test_basic_assertion_abruptly_closed_cdata_section() { ); } + /** + * Ensures that basic CDATA sections inside foreign content are detected. + * + * @ticket {TICKET_NUMBER} + */ + public function test_basic_cdata_in_foreign_content() { + $processor = new WP_HTML_Tag_Processor( 'this is >&gt; real CDATA' ); + $processor->next_token(); + $processor->next_token(); + + $this->assertSame( + '#cdata-section', + $processor->get_token_name(), + "Should have found a CDATA section but found {$processor->get_token_name()} instead." + ); + + $this->assertNull( + $processor->get_tag(), + 'Should not have been able to query tag name on non-element token.' + ); + + $this->assertNull( + $processor->get_attribute( 'type' ), + 'Should not have been able to query attributes on non-element token.' + ); + + $this->assertSame( + 'this is >> real CDATA', + $processor->get_modifiable_text(), + 'Found incorrect modifiable text.' + ); + } + /** * Ensures that normative Processing Instruction nodes are properly parsed. * From 4710045c8053568da87ba1f154baadcf2099a495 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 14:53:09 -0700 Subject: [PATCH 10/25] Implement via external settings. 
--- .../html-api/class-wp-html-tag-processor.php | 41 ++++++++++++++++++- .../wpHtmlTagProcessor-token-scanning.php | 7 ++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 1b4db41bcee12..edcc7be810d16 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -515,6 +515,21 @@ class WP_HTML_Tag_Processor { */ protected $parser_state = self::STATE_READY; + /** + * Indicates whether the parser is inside foreign content, + * e.g. inside an SVG or MathML element. + * + * Several parsing rules change based on whether the parser + * is inside foreign content, including whether CDATA sections + * are allowed and whether a self-closing flag indicates that + * an element has no content. + * + * @since 6.5.0 + * + * @var bool + */ + protected $is_inside_foreign_content = false; + /** * What kind of syntax token became an HTML comment. * @@ -944,7 +959,6 @@ public function next_token() { $duplicate_attributes = $this->duplicate_attributes; // Find the closing tag if necessary. 
- $found_closer = false; switch ( $tag_name ) { case 'SCRIPT': $found_closer = $this->skip_script_data(); @@ -1719,6 +1733,31 @@ private function parse_next_tag() { return true; } + if ( + $this->is_inside_foreign_content && + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 1 ); + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + + $this->parser_state = self::STATE_CDATA_NODE; + $this->text_starts_at = $at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + return true; + } + /* * Anything else here is an incorrectly-opened comment and transitions * to the bogus comment state - skip to the nearest >. If no closer is diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php index debfdd6541e03..3945c06a9f262 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-token-scanning.php @@ -443,6 +443,13 @@ public function test_basic_assertion_abruptly_closed_cdata_section() { public function test_basic_cdata_in_foreign_content() { $processor = new WP_HTML_Tag_Processor( 'this is >&gt; real CDATA' ); $processor->next_token(); + + // Artificially set flag; this should be done in the HTML Processor. 
+ $reflector = new ReflectionClass( $processor ); + $is_inside_foreign_content = $reflector->getProperty( 'is_inside_foreign_content' ); + $is_inside_foreign_content->setAccessible( true ); + $is_inside_foreign_content->setValue( $processor, true ); + $processor->next_token(); $this->assertSame( From 52c9c9cc895a8d48253497b8cf7cf3f85f83526f Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 21:44:33 -0700 Subject: [PATCH 11/25] Support scanning all tokens in the HTML Processor. --- .../html-api/class-wp-html-processor.php | 73 +++++++++++++------ 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 6b0879cde892f..17fd880c1a83c 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -361,6 +361,10 @@ public function get_last_error() { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -384,6 +388,10 @@ public function next_tag( $query = null ) { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -430,9 +438,7 @@ public function next_tag( $query = null ) { public function next_token() { $found_a_token = parent::next_token(); - if ( '#tag' === $this->get_token_type() ) { - $this->step( self::PROCESS_CURRENT_NODE ); - } + $this->step( self::PROCESS_CURRENT_NODE ); return $found_a_token; } @@ -529,25 +535,32 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * is provided in the opening tag, otherwise it expects a tag closer. 
*/ $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + if ( + $top_node && + ( + self::is_void( $top_node->node_name ) || + $top_node->node_name[0] < 'A' || + $top_node->node_name[0] > 'Z' + ) + ) { $this->state->stack_of_open_elements->pop(); } } if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { - continue; - } + parent::next_token(); } - // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state || + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), - $this->get_tag(), + $this->get_token_name(), $this->is_tag_closer(), $this->release_internal_bookmark_on_destruct ); @@ -591,10 +604,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -619,11 +628,37 @@ public function get_breadcrumbs() { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? '-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? 
'-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; switch ( $op ) { + case '#comment': + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + // Ignore DOCTYPE declarations. + return $this->step(); + + case '#text': + $text_at = $this->bookmarks[ $this->state->current_token->bookmark_name ]->start; + $text_length = $this->bookmarks[ $this->state->current_token->bookmark_name ]->length; + + if ( 1 === $text_length && "\x00" === $this->html[ $text_at ] ) { + // Ignore this token. + return $this->step(); + } + + if ( strspn( $this->html, " \t\n\f\r", $text_at, $text_length ) !== $text_length ) { + $this->state->frameset_ok = false; + } + + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + return true; + /* * > A start tag whose tag name is "button" */ @@ -1151,10 +1186,6 @@ private function step_in_body() { * @return string|false Name of created bookmark, or false if unable to create. */ private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); From 716491b35fcda15927445f28c361b944b4038495 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 22:04:40 -0700 Subject: [PATCH 12/25] Update test which needs to now skip over text nodes. 
--- tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php index 7362a588cf3f4..546581ec3f4bc 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorSemanticRules.php @@ -132,6 +132,7 @@ public function test_in_body_skips_unexpected_button_closer() { * When encountering the BUTTON closing tag, there is no BUTTON in the stack of open elements. * It should be ignored as there's no BUTTON to close. */ + $processor->step(); $this->assertTrue( $processor->step(), 'Found no further tags when it should have found the closing DIV' ); $this->assertSame( 'DIV', $processor->get_tag(), "Did not skip unexpected BUTTON; stopped at {$processor->get_tag()}." ); $this->assertTrue( $processor->is_tag_closer(), 'Did not find that the terminal DIV tag is a closer.' ); From e88e5f99a998bac078bb0a3abf2d004acbe8bb07 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 23:32:49 -0700 Subject: [PATCH 13/25] WIP: Support elements in foreign content. 
--- .../html-api/class-wp-html-processor.php | 323 +++++++++++++++++- .../html-api/class-wp-html-token.php | 18 + .../html-api/wpHtmlProcessorBreadcrumbs.php | 2 - 3 files changed, 340 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 17fd880c1a83c..85198565709c0 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -538,6 +538,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { if ( $top_node && ( + ( 'html' !== $top_node->namespace && $top_node->has_self_closing_flag ) || self::is_void( $top_node->node_name ) || $top_node->node_name[0] < 'A' || $top_node->node_name[0] > 'Z' @@ -558,14 +559,53 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { return false; } + $token_name = $this->get_token_name(); + $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), - $this->get_token_name(), + $token_name, $this->is_tag_closer(), $this->release_internal_bookmark_on_destruct ); + // @todo This is the context node in a fragment but it seems to be the same in practice. + $adjusted_current_node = $this->state->stack_of_open_elements->current_node(); + try { + // @see https://html.spec.whatwg.org/#tree-construction-dispatcher + $parse_in_foreign_content = ! ( + ! $adjusted_current_node || + 'html' === $adjusted_current_node->namespace || + ( + 'mathml' === $adjusted_current_node->integration_node_type && + ! $this->is_tag_closer() && + 'MGLYPH' !== $token_name && + 'MALIGNMARK' !== $token_name + ) || + ( + 'mathml' === $adjusted_current_node->integration_node_type && + '#text' === $token_name + ) || + ( + 'mathml' === $adjusted_current_node->integration_node_type && + 'ANNOTATION-XML' === $adjusted_current_node->node_name && + ! 
$this->is_tag_closer() && + 'SVG' === $token_name + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + ! $this->is_tag_closer() + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + '#text' === $token_name + ) + ); + + if ( $parse_in_foreign_content ) { + return $this->step_in_foreign_content(); + } + switch ( $this->state->insertion_mode ) { case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: return $this->step_in_body(); @@ -1060,6 +1100,24 @@ private function step_in_body() { case '+TRACK': $this->insert_html_element( $this->state->current_token ); return true; + + /* + * > A start tag whose tag name is "math" + */ + case '+MATH': + $this->reconstruct_active_formatting_elements(); + $this->state->current_token->namespace = 'math'; + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "svg" + */ + case '+SVG': + $this->reconstruct_active_formatting_elements(); + $this->state->current_token->namespace = 'svg'; + $this->insert_html_element( $this->state->current_token ); + return true; } /* @@ -1172,6 +1230,193 @@ private function step_in_body() { } } + /** + * Parses next element in foreign content. + * + * This internal function performs the 'in foreign content' logic + * for the generalized WP_HTML_Processor::step() function. This is + * not a distinct insertion mode. + * + * @since 6.5.0 + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see WP_HTML_Processor::step + * @see https://html.spec.whatwg.org/#parsing-main-inbody + * + * @return bool Whether an element was found. + */ + private function step_in_foreign_content() { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? 
'-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + /* + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + */ + if ( + '+FONT' === $op && + ( + null !== $this->get_attribute( 'color' ) || + null !== $this->get_attribute( 'face' ) || + null !== $this->get_attribute( 'size' ) + ) + ) { + // @todo Indicate a parse error once it's possible. + while ( null !== ( $current_node = $this->state->stack_of_open_elements->current_node() ) ) { + if ( null === $current_node->integration_node_type || 'html' === $current_node->namespace ) { + break; + } + + $this->state->stack_of_open_elements->pop(); + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + switch ( $op ) { + case '#comment': + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + // Ignore DOCTYPE declarations. + return $this->step(); + + case '#text': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "b", "big", "blockquote", "body", "br", "center", + * > "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", + * > "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", + * > "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", + * > "table", "tt", "u", "ul", "var" + * + * > An end tag whose tag name is "br", "p" + */ + case '+B': + case '+BIG': + case '+BLOCKQUOTE': + case '+BODY': + case '+BR': + case '+CENTER': + case '+CODE': + case '+DD': + case '+DIV': + case '+DL': + case '+DT': + case '+EM': + case '+EMBED': + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + case '+HEAD': + case '+HR': + case '+I': + case '+IMG': + case '+LI': + case '+LISTING': + case '+MENU': + case '+META': + case '+NOBR': + case '+OL': + case '+P': + case '+PRE': + case '+RUBY': + case '+S': + case '+SMALL': + case '+SPAN': + case '+STRONG': + case 
'+STRIKE': + case '+SUB': + case '+SUP': + case '+TABLE': + case '+TT': + case '+U': + case '+UL': + case '+VAR': + case '-BR': + case '-P': + // @todo Indicate a parse error once it's possible. + while ( null !== ( $current_node = $this->state->stack_of_open_elements->current_node() ) ) { + if ( null === $current_node->integration_node_type || 'html' === $current_node->namespace ) { + break; + } + + $this->state->stack_of_open_elements->pop(); + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /* + * > Any other start tag + */ + if ( ! $this->is_tag_closer() ) { + // @todo Adjust foreign attributes; this probably should be done in get_attribute(). + + // @todo This is the adjusted current node, will it ever not be this? + $current_node = $this->state->stack_of_open_elements->current_node(); + $this->state->current_token = $current_node->namespace; + + // @todo There should be a false `onlyAddToElementStack` parameter that does stuff. + $this->insert_html_element( $this->state->current_token ); + return true; + } + + /* + * > An end tag whose name is "script", if the current node is an SVG script element. + */ + // @todo Does this rule matter here? + + /* + * > Any other end tag + */ + if ( $this->is_tag_closer() ) { + $node = $this->state->stack_of_open_elements->current_node(); + if ( $tag_name !== $node->node_name ) { + // @todo Indicate a parse error once it's possible. + } + in_foreign_content_end_tag_loop: + if ( $node === $this->state->stack_of_open_elements->stack[0] ) { + // @todo should this return $this->step() instead? + return true; + } + + /* + * > If node's tag name, converted to ASCII lowercase, is the same as the tag name + * > of the token, pop elements from the stack of open elements until node has + * > been popped from the stack, and then return. 
+ */ + if ( $node->node_name === $tag_name ) { + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + // @todo should this return $this->step() instead? + return true; + } + } + } + + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + + if ( 'html' !== $node->namespace ) { + goto in_foreign_content_end_tag_loop; + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + } + /* * Internal helpers */ @@ -1642,6 +1887,82 @@ private function insert_html_element( $token ) { * HTML Specification Helpers */ + /** + * Indicates if the current token is a MathML integration point. + * + * @since 6.5.0 + * + * @see https://html.spec.whatwg.org/#mathml-text-integration-point + * + * @return bool Whether the current token is a MathML integration point. + */ + private function is_mathml_integration_point() { + $token = $this->state->current_token; + + if ( 'mathml' !== $token->namespace || 'M' !== $token->node_name[0] ) { + return false; + } + + $tag_name = $token->node_name; + + return ( + 'MI' === $tag_name || + 'MO' === $tag_name || + 'MN' === $tag_name || + 'MS' === $tag_name || + 'MTEXT' === $tag_name + ); + } + + /** + * Indicates if the current token is an HTML integration point. + * + * Note that this method must be an instance method with access + * to the current token, since it needs to examine the attributes + * of the currently-matched tag, if it's in the MathML namespace. + * Otherwise it would be required to scan the HTML and ensure that + * no other accounting is overlooked. + * + * @since 6.5.0 + * + * @see https://html.spec.whatwg.org/#html-integration-point + * + * @return bool Whether the current token is an HTML integration point. 
+ */ + private function is_html_integration_point() { + $token = $this->state->current_token; + + if ( 'html' === $token->namespace ) { + return false; + } + + $tag_name = $token->node_name; + + if ( 'svg' === $token->namespace ) { + return ( + 'DESC' === $tag_name || + 'FOREIGNOBJECT' === $tag_name || + 'TITLE' === $tag_name + ); + } + + if ( 'mathml' === $token->namespace ) { + if ( 'ANNOTATION-XML' !== $tag_name ) { + return false; + } + + $encoding = $this->get_attribute( 'encoding' ); + + return ( + is_string( $encoding ) && + ( + 0 === strcasecmp( $encoding, 'application/xhtml+xml' ) || + 0 === strcasecmp( $encoding, 'text/html' ) + ) + ); + } + } + /** * Returns whether an element of a given name is in the HTML special category. * diff --git a/src/wp-includes/html-api/class-wp-html-token.php b/src/wp-includes/html-api/class-wp-html-token.php index 86dd7658cfcee..ca0e4ed37f2bd 100644 --- a/src/wp-includes/html-api/class-wp-html-token.php +++ b/src/wp-includes/html-api/class-wp-html-token.php @@ -60,6 +60,24 @@ class WP_HTML_Token { */ public $has_self_closing_flag = false; + /** + * Indicates if the element is an HTML element or if it's inside foreign content. + * + * @since 6.5.0 + * + * @var string 'html', 'svg', or 'math'. + */ + public $namespace = 'html'; + + /** + * Indicates which kind of integration point the element is, if any. + * + * @since 6.5.0 + * + * @var string|null 'mathml', 'html', or null if not an integration point. + */ + public $integration_node_type = null; + /** * Called when token is garbage-collected or otherwise destroyed. 
* diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php index 1488be91654a7..dabf04c18e5d7 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorBreadcrumbs.php @@ -182,7 +182,6 @@ public static function data_unsupported_elements() { 'IFRAME', 'LINK', 'MARQUEE', // Deprecated. - 'MATH', 'META', 'NOBR', // Neutralized. 'NOEMBED', // Neutralized. @@ -199,7 +198,6 @@ public static function data_unsupported_elements() { 'SCRIPT', 'SELECT', 'STYLE', - 'SVG', 'TABLE', 'TBODY', 'TD', From 71416e30bdd3a2d2a928cfabd0db9683b8660b92 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Feb 2024 23:54:07 -0700 Subject: [PATCH 14/25] fixup! WIP: Support elements in foreign content. --- .../html-api/class-wp-html-processor.php | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 85198565709c0..14715065ef727 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1344,12 +1344,10 @@ private function step_in_foreign_content() { case '-BR': case '-P': // @todo Indicate a parse error once it's possible. 
- while ( null !== ( $current_node = $this->state->stack_of_open_elements->current_node() ) ) { - if ( null === $current_node->integration_node_type || 'html' === $current_node->namespace ) { - break; - } - + $current_node = $this->state->stack_of_open_elements->current_node(); + while ( $current_node && null === $current_node->integration_node_type && 'html' !== $current_node->namespace ) { $this->state->stack_of_open_elements->pop(); + $current_node = $this->state->stack_of_open_elements->current_node(); } return $this->step( self::REPROCESS_CURRENT_NODE ); @@ -1362,8 +1360,30 @@ private function step_in_foreign_content() { // @todo Adjust foreign attributes; this probably should be done in get_attribute(). // @todo This is the adjusted current node, will it ever not be this? - $current_node = $this->state->stack_of_open_elements->current_node(); - $this->state->current_token = $current_node->namespace; + $current_node = $this->state->stack_of_open_elements->current_node(); + $this->state->current_token->namespace = $current_node->namespace; + + if ( + 'svg' === $current_node->namespace && + ( 'DESC' === $tag_name || 'FOREIGNOBJECT' === $tag_name || 'TITLE' === $tag_name ) + ) { + $this->state->current_token->integration_node_type = 'html'; + } elseif ( + 'mathml' === $current_node->namespace && + ( 'MI' === $tag_name || 'MO' === $tag_name || 'MN' === $tag_name || 'MS' === $tag_name || 'MTEXT' === $tag_name ) + ) { + $this->state->current_token->integration_node_type = 'mathml'; + } elseif ( 'mathml' === $current_node->namespace && 'ANNOTATION_XML' === $tag_name ) { + $encoding = $this->get_attribute( 'encoding' ); + + if ( is_string( $encoding ) ) { + $encoding = strtolower( $encoding ); + + if ( 'text/html' === $encoding || 'application/xhtml+xml' === $encoding ) { + $this->state->current_token->integration_node_type = 'html'; + } + } + } // @todo There should be a false `onlyAddToElementStack` parameter that does stuff. 
$this->insert_html_element( $this->state->current_token ); @@ -1413,7 +1433,7 @@ private function step_in_foreign_content() { goto in_foreign_content_end_tag_loop; } - return $this->step( self::REPROCESS_CURRENT_NODE ); + return $this->step( self::PROCESS_CURRENT_NODE ); } } From 823698c54632323672f399b1c5968f3287779775 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 2 Feb 2024 00:20:20 -0700 Subject: [PATCH 15/25] Fix: store self-closing flag, not end-tag flag. --- src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 14715065ef727..28ea379200d74 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -564,7 +564,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), $token_name, - $this->is_tag_closer(), + $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); From 2a3743ead1b23d3bc76c35b43ad1de29fbb84945 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 2 Feb 2024 00:25:44 -0700 Subject: [PATCH 16/25] Update labels, collapse next_token() to step(). --- .../html-api/class-wp-html-processor.php | 23 +++++-------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 28ea379200d74..7e71fd2c8ec72 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -422,25 +422,14 @@ public function next_tag( $query = null ) { } /** - * Ensures internal accounting is maintained for HTML semantic rules while - * the underlying Tag Processor class is seeking to a bookmark. 
+ * Advances to the next token in the document. * - * This doesn't currently have a way to represent non-tags and doesn't process - * semantic rules for text nodes. For access to the raw tokens consider using - * WP_HTML_Tag_Processor instead. - * - * @since 6.5.0 Added for internal support; do not use. - * - * @access private + * @since 6.5.0 * - * @return bool + * @return bool Whether a token was found. */ public function next_token() { - $found_a_token = parent::next_token(); - - $this->step( self::PROCESS_CURRENT_NODE ); - - return $found_a_token; + return $this->step(); } /** @@ -501,7 +490,7 @@ public function matches_breadcrumbs( $breadcrumbs ) { } /** - * Steps through the HTML document and stop at the next tag, if any. + * Steps through the HTML document and stop at the next token, if any. * * @since 6.4.0 * @@ -511,7 +500,7 @@ public function matches_breadcrumbs( $breadcrumbs ) { * @see self::REPROCESS_CURRENT_NODE * * @param string $node_to_process Whether to parse the next node or reprocess the current node. - * @return bool Whether a tag was matched. + * @return bool Whether a token was matched. */ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Refuse to proceed if there was a previous error. 
From 1dc75c25b0801c53a71f038f30b808ae4b4577bc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 2 Feb 2024 13:31:57 +0100 Subject: [PATCH 17/25] Fix math (not mathml) namespace --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 7e71fd2c8ec72..10e6baf08cc55 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1358,11 +1358,11 @@ private function step_in_foreign_content() { ) { $this->state->current_token->integration_node_type = 'html'; } elseif ( - 'mathml' === $current_node->namespace && + 'math' === $current_node->namespace && ( 'MI' === $tag_name || 'MO' === $tag_name || 'MN' === $tag_name || 'MS' === $tag_name || 'MTEXT' === $tag_name ) ) { $this->state->current_token->integration_node_type = 'mathml'; - } elseif ( 'mathml' === $current_node->namespace && 'ANNOTATION_XML' === $tag_name ) { + } elseif ( 'math' === $current_node->namespace && 'ANNOTATION_XML' === $tag_name ) { $encoding = $this->get_attribute( 'encoding' ); if ( is_string( $encoding ) ) { @@ -1908,7 +1908,7 @@ private function insert_html_element( $token ) { private function is_mathml_integration_point() { $token = $this->state->current_token; - if ( 'mathml' !== $token->namespace || 'M' !== $token->node_name[0] ) { + if ( 'math' !== $token->namespace || 'M' !== $token->node_name[0] ) { return false; } @@ -1955,7 +1955,7 @@ private function is_html_integration_point() { ); } - if ( 'mathml' === $token->namespace ) { + if ( 'math' === $token->namespace ) { if ( 'ANNOTATION-XML' !== $tag_name ) { return false; } From 771354b795a330181a7aa3e3ae5c4a4a47cc26ee Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 2 Feb 2024 13:32:14 +0100 Subject: [PATCH 18/25] Fix ANNOTATION-XML tag name --- 
src/wp-includes/html-api/class-wp-html-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 10e6baf08cc55..d2b9f79805069 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1362,7 +1362,7 @@ private function step_in_foreign_content() { ( 'MI' === $tag_name || 'MO' === $tag_name || 'MN' === $tag_name || 'MS' === $tag_name || 'MTEXT' === $tag_name ) ) { $this->state->current_token->integration_node_type = 'mathml'; - } elseif ( 'math' === $current_node->namespace && 'ANNOTATION_XML' === $tag_name ) { + } elseif ( 'math' === $current_node->namespace && 'ANNOTATION-XML' === $tag_name ) { $encoding = $this->get_attribute( 'encoding' ); if ( is_string( $encoding ) ) { From 7e3f9cd391b695fa1ae00017a834b9569b22f5af Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 2 Feb 2024 12:49:19 +0100 Subject: [PATCH 19/25] html5lib-tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squashed commit of the following: commit 8f746ff09441ccbf46524418fe50bd218df8d7e6 Author: Jon Surrell Date: Fri Feb 2 11:53:22 2024 +0100 var_export our token type commit a3686face727b910728bb1d0b590471b956ba30c Author: Jon Surrell Date: Wed Jan 31 20:24:48 2024 +0100 Fix lint commit 82db1174a0be80aef890553d4ee87bd28cdb0802 Author: Jon Surrell Date: Wed Jan 31 19:08:23 2024 +0100 Skip whitespace test commit a3f9866666f65f6c82afacbf9f70916a79862434 Author: Jon Surrell Date: Wed Jan 31 18:58:47 2024 +0100 Throw on unhandled token types commit 742afc9ec3182dbf53476d881aa104c80a9da22f Author: Jon Surrell Date: Wed Jan 31 17:53:01 2024 +0100 Handle CDATA lookalike comment types commit b1372702c1aaada0f2b4138f06f1a2d06ce74550 Author: Jon Surrell Date: Wed Jan 31 17:37:56 2024 +0100 fix some test skipping commit 
ba1036d27de16e82bd3278898a11ef03114df8cf Author: Jon Surrell Date: Wed Jan 31 16:46:30 2024 +0100 Add description to test class commit 6a4967ea05c07aa7de0a7c6984477cbf236a4e5f Author: Jon Surrell Date: Wed Jan 31 16:37:32 2024 +0100 Expand README and add update instructions commit c428e287de184490e135e3aa3fa38d92c96c9da5 Author: Jon Surrell Date: Wed Jan 31 16:26:37 2024 +0100 Replace $p with $processor commit e4a45959b6c94593b62efb2747a13472e9e64a04 Author: Jon Surrell Date: Wed Jan 31 16:24:47 2024 +0100 Fix void tag indenting commit f5ad48c0e734e730c126f75c71baebee75598d8c Author: Jon Surrell Date: Tue Jan 30 19:29:24 2024 +0100 Remove leading class body space commit 0dc5dfcc3751a162414287bbafea1787b5656c92 Author: Jon Surrell Date: Tue Jan 30 16:43:06 2024 +0100 Update skips commit 18e3c5b5efb13db6141801029a31a75549bb559b Author: Jon Surrell Date: Tue Jan 30 16:12:56 2024 +0100 Skip all entities for now commit e070d40f1efe7121ac02878d14991e65f77421c4 Author: Jon Surrell Date: Tue Jan 30 16:04:28 2024 +0100 Better variable name commit 1652f4a7764120e404f3fca02ddf115c71ed78db Author: Jon Surrell Date: Tue Jan 30 15:04:08 2024 +0100 Skip entities tests Decoding with PHP is buggy commit d2a182e8afe2310794dd334978e488eae1b8bfd9 Author: Jon Surrell Date: Tue Jan 30 14:57:35 2024 +0100 Add much more HTML to tests commit 79a286763d84c412b28113833d02a8781495f1a5 Author: Jon Surrell Date: Tue Jan 30 12:58:48 2024 +0100 Remove comment test skip commit 9bcc39b1613da56463c928a0c937529d6d425782 Author: Jon Surrell Date: Tue Jan 30 12:55:35 2024 +0100 Remove covers commit 1bd74b2418ab6c809059dad6c8eb7f2874f485bf Author: Jon Surrell Date: Tue Jan 30 12:55:27 2024 +0100 Add todo comments commit 8ebe41808edada59544db88f30d4ef96c203f54f Author: Jon Surrell Date: Mon Jan 29 17:23:41 2024 +0100 Update covers commit ae9c1f93fbdd26f2fd0270f6f3650eb6feb9440a Author: Jon Surrell Date: Tue Jan 23 17:35:41 2024 +0100 Use DIR_TESTDATA commit 34351ee74c307189785905b0b16ebe80e810c01f 
Author: Jon Surrell Date: Sun Jan 21 21:12:25 2024 +0100 Ignore another P tag test commit a9c1d86fe6769f979da1769afa1bc46835ebed16 Author: Jon Surrell Date: Wed Jan 17 16:19:08 2024 +0100 Add skip for known bug - all tests passing or skipped commit bf011e420adb2b54a197d11ffeea95003e291278 Author: Jon Surrell Date: Wed Jan 17 16:15:05 2024 +0100 Rename class and test function commit 9ae1f5177379b970606cee4e976f1123be06bc10 Author: Jon Surrell Date: Wed Jan 17 16:11:42 2024 +0100 Fixing more lints commit ab3a7279fc916ce932c8e3ff4347e57c653a6c61 Author: Jon Surrell Date: Tue Jan 16 22:44:52 2024 +0100 Clean up and refactor test document parsing commit 7e65fccdb310c40eec2ae2ffde7f5c970aa603c5 Author: Jon Surrell Date: Tue Jan 16 22:12:18 2024 +0100 Add attributes to html5lib tests commit 9ffa44ad80ec45b235702deb7b69d4e07646fb77 Author: Jon Surrell Date: Tue Jan 16 15:08:26 2024 +0100 Fix lint commit d6c7334b1340a06fe4ec31303687fc68b073905c Author: Jon Surrell Date: Tue Jan 16 14:37:06 2024 +0100 Skip head tests commit 95e52d99e5100a7d3bd04ae41f6e2f6f5e57302e Author: Jon Surrell Date: Tue Jan 16 14:31:22 2024 +0100 Fix some comments commit 864ed430445b61f9d2669fd9451bbea60a04147d Author: Jon Surrell Date: Tue Jan 16 14:13:11 2024 +0100 Fix strlen paren bug commit a6ece6d439425132f60b723bcbbfdbe7b4c44aeb Author: Jon Surrell Date: Tue Jan 16 14:09:38 2024 +0100 Fix lints commit 9280fd89df51c75e188c9aa3ce9da958e220c02a Author: Jon Surrell Date: Tue Jan 16 14:09:31 2024 +0100 Mark unsupported markup tests as incomplete, not skipped commit 071b845a41be509b2fb8f20df45d5bf894a69b81 Author: Jon Surrell Date: Tue Jan 16 14:06:10 2024 +0100 Skip incomplete token tests commit 46102df36c0a0ba238dab87ab74350e031fc2bd4 Author: Jon Surrell Date: Tue Jan 16 14:01:29 2024 +0100 Update ignores commit fc56850d934ca3d328f2311ed24d29b085dc205d Author: Jon Surrell Date: Mon Jan 15 21:35:50 2024 +0100 Fix HTML input processing commit 725cbbc9b4346b2e91707a2d288cf99e7605c5a4 Author: Jon 
Surrell Date: Mon Jan 15 19:05:22 2024 +0100 Use padded line number Allows filtering like line0001 so not line1 line10 line11… commit b35833c12a9a8179fa04412d72343b091cd8c2a1 Author: Jon Surrell Date: Mon Jan 15 18:31:27 2024 +0100 Use line numbers for test IDs Line numbers are stable even if we skip tests commit 146fa7fe86c1790a2abd7977571661b5ed5b44f3 Author: Jon Surrell Date: Fri Dec 22 17:38:19 2023 +0100 Avoid running tests that expect anything in commit 215648cce6e8fd1161a07c7e66abf7d678dc7df6 Author: Dennis Snell Date: Wed Dec 20 10:49:50 2023 -0600 Add extra skipped tests commit 47794b6a713ab0c92a05f0270a0a36c6aa3c9427 Author: Jon Surrell Date: Wed Dec 20 13:22:07 2023 +0100 Fix expect/actual ordering, add test message commit 57095fcc32464df3ff8a069443f854524f74006e Author: Jon Surrell Date: Tue Dec 19 20:20:26 2023 +0100 Move test data to test data dir commit 9d1ab0df79431f21ce739f4ce3bc05769485de7e Author: Jon Surrell Date: Tue Dec 19 18:32:55 2023 +0100 Add ignores for formatting elements commit ffa71f1bcc99dd56fc3b703d430b3723d55ef940 Author: Jon Surrell Date: Tue Dec 19 18:25:56 2023 +0100 Fix lint commit 46564ce7c8622ef4aee69eec339bdc656cb5f1bd Author: Jon Surrell Date: Tue Dec 19 18:16:09 2023 +0100 Add files crediting html5lib-tests project commit 860ab5415565037b1b3aafc01e2c2379d5d7031b Author: Jon Surrell Date: Tue Dec 19 18:06:37 2023 +0100 Add skipping of certain tests commit 04d94abadaf83c5e73476f6b765e77ea08f5c4fd Author: Jon Surrell Date: Tue Dec 19 15:30:03 2023 +0100 Remove space from test identifier, easier copy/paste filtering commit 210e7ff0003a72693ad6742133b99d391cc7e94c Author: Jon Surrell Date: Tue Dec 19 15:20:47 2023 +0100 Better tag finding commit 82a6e95f93399407dfcc4efd49040db45df2d465 Author: Jon Surrell Date: Tue Dec 19 14:07:30 2023 +0100 Print nicer tests names commit 44a8369b6a71ec03b6c3fbec04e95f8440cd824a Author: Jon Surrell Date: Tue Dec 19 13:59:21 2023 +0100 Skip doctype and comments in test dom tree commit
552c68e198ee490bd3ddbe7a3280aa4a81536b59 Author: Jon Surrell Date: Tue Dec 19 13:58:57 2023 +0100 1-index test case numbering commit 89191b8a21db3af4d3d34daab2f64c0f87b6cbf6 Author: Dennis Snell Date: Mon Dec 18 16:20:41 2023 -0600 WPCS Nags commit f2b77299d090d4c41e2da399b547a7b8eb7a60a0 Author: Dennis Snell Date: Mon Dec 18 15:57:53 2023 -0600 Add line number to test case label commit 8801e55090614e523c671db5c129dc214a861c0c Author: Dennis Snell Date: Mon Dec 18 15:23:47 2023 -0600 Avoid WPCS lint nags; skip tests for unsupported input or fragment context. commit e81776d13d68ce051c3f77073ad7dbfefebd5283 Author: Jon Surrell Date: Mon Dec 18 21:38:12 2023 +0100 Skip unhandled tests commit 0c69dd82843188eb52edc02d7a138197d8269ad4 Author: Jon Surrell Date: Mon Dec 18 21:22:51 2023 +0100 fix lints commit 715ea111b05b65478f7363506e3234234c69a03e Author: Jon Surrell Date: Mon Dec 18 21:18:04 2023 +0100 Move html5lib tests to new class commit b109b45a8e0d155be278f36207e0d3fa6f9a6707 Author: Jon Surrell Date: Mon Dec 18 21:13:49 2023 +0100 Remove git files from html5lib commit f7e02c33d83302a33ff04309106ccb8f7653d099 Author: Jon Surrell Date: Mon Dec 18 20:26:12 2023 +0100 Add test cases from html5lib-tests tree-construction --- tests/phpunit/data/html5lib-tests/AUTHORS.rst | 34 + tests/phpunit/data/html5lib-tests/LICENSE | 21 + tests/phpunit/data/html5lib-tests/README.md | 25 + .../tree-construction/README.md | 108 + .../tree-construction/adoption01.dat | 354 +++ .../tree-construction/adoption02.dat | 39 + .../tree-construction/blocks.dat | 695 +++++ .../tree-construction/comments01.dat | 217 ++ .../tree-construction/doctype01.dat | 474 +++ .../tree-construction/domjs-unsafe.dat | Bin 0 -> 10356 bytes .../tree-construction/entities01.dat | 943 ++++++ .../tree-construction/entities02.dat | 309 ++ .../tree-construction/foreign-fragment.dat | 645 ++++ .../tree-construction/html5test-com.dat | 301 ++ .../tree-construction/inbody01.dat | 54 + .../tree-construction/isindex.dat 
| 49 + .../tree-construction/main-element.dat | 46 + .../html5lib-tests/tree-construction/math.dat | 104 + .../tree-construction/menuitem-element.dat | 240 ++ .../namespace-sensitivity.dat | 22 + .../tree-construction/noscript01.dat | 237 ++ ...pending-spec-changes-plain-text-unsafe.dat | Bin 0 -> 927 bytes .../pending-spec-changes.dat | 46 + .../tree-construction/plain-text-unsafe.dat | Bin 0 -> 9486 bytes .../tree-construction/quirks01.dat | 53 + .../html5lib-tests/tree-construction/ruby.dat | 302 ++ .../tree-construction/scriptdata01.dat | 372 +++ .../tree-construction/search-element.dat | 46 + .../html5lib-tests/tree-construction/svg.dat | 104 + .../tree-construction/tables01.dat | 322 ++ .../tree-construction/template.dat | 1673 +++++++++++ .../tree-construction/tests1.dat | 1956 +++++++++++++ .../tree-construction/tests10.dat | 849 ++++++ .../tree-construction/tests11.dat | 523 ++++ .../tree-construction/tests12.dat | 62 + .../tree-construction/tests14.dat | 75 + .../tree-construction/tests15.dat | 216 ++ .../tree-construction/tests16.dat | 2602 +++++++++++++++++ .../tree-construction/tests17.dat | 179 ++ .../tree-construction/tests18.dat | 558 ++++ .../tree-construction/tests19.dat | 1398 +++++++++ .../tree-construction/tests2.dat | 831 ++++++ .../tree-construction/tests20.dat | 842 ++++++ .../tree-construction/tests21.dat | 306 ++ .../tree-construction/tests22.dat | 190 ++ .../tree-construction/tests23.dat | 168 ++ .../tree-construction/tests24.dat | 79 + .../tree-construction/tests25.dat | 288 ++ .../tree-construction/tests26.dat | 453 +++ .../tree-construction/tests3.dat | 305 ++ .../tree-construction/tests4.dat | 74 + .../tree-construction/tests5.dat | 210 ++ .../tree-construction/tests6.dat | 663 +++++ .../tree-construction/tests7.dat | 453 +++ .../tree-construction/tests8.dat | 165 ++ .../tree-construction/tests9.dat | 472 +++ .../tree-construction/tests_innerHTML_1.dat | 843 ++++++ .../tree-construction/tricky01.dat | 336 +++ 
.../tree-construction/webkit01.dat | 785 +++++ .../tree-construction/webkit02.dat | 554 ++++ .../html-api/wpHtmlProcessorHtml5lib.php | 336 +++ 61 files changed, 24606 insertions(+) create mode 100644 tests/phpunit/data/html5lib-tests/AUTHORS.rst create mode 100644 tests/phpunit/data/html5lib-tests/LICENSE create mode 100644 tests/phpunit/data/html5lib-tests/README.md create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/README.md create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/adoption02.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/blocks.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/comments01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/doctype01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/domjs-unsafe.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/entities01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/entities02.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/foreign-fragment.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/html5test-com.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/inbody01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/isindex.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/main-element.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/math.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/menuitem-element.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/namespace-sensitivity.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/noscript01.dat create mode 100644 
tests/phpunit/data/html5lib-tests/tree-construction/pending-spec-changes-plain-text-unsafe.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/pending-spec-changes.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/plain-text-unsafe.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/quirks01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/ruby.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/scriptdata01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/search-element.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/svg.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tables01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/template.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests1.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests10.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests11.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests12.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests14.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests15.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests16.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests17.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests18.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests19.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests2.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests20.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests21.dat create mode 100644 
tests/phpunit/data/html5lib-tests/tree-construction/tests22.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests23.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests24.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests25.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests26.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests3.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests4.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests5.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests6.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests7.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests8.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests9.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tests_innerHTML_1.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/tricky01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/webkit01.dat create mode 100644 tests/phpunit/data/html5lib-tests/tree-construction/webkit02.dat create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php diff --git a/tests/phpunit/data/html5lib-tests/AUTHORS.rst b/tests/phpunit/data/html5lib-tests/AUTHORS.rst new file mode 100644 index 0000000000000..4a7de17ad456c --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/AUTHORS.rst @@ -0,0 +1,34 @@ +Credits +======= + +The ``html5lib`` test data is maintained by: + +- James Graham +- Geoffrey Sneddon + + +Contributors +------------ + +- Adam Barth +- Andi Sidwell +- Anne van Kesteren +- David Flanagan +- Edward Z. 
Yang +- Geoffrey Sneddon +- Henri Sivonen +- Ian Hickson +- Jacques Distler +- James Graham +- Lachlan Hunt +- lantis63 +- Mark Pilgrim +- Mats Palmgren +- Ms2ger +- Nolan Waite +- Philip Taylor +- Rafael Weinstein +- Ryan King +- Sam Ruby +- Simon Pieters +- Thomas Broyer diff --git a/tests/phpunit/data/html5lib-tests/LICENSE b/tests/phpunit/data/html5lib-tests/LICENSE new file mode 100644 index 0000000000000..8812371b41cfc --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and +other contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/tests/phpunit/data/html5lib-tests/README.md b/tests/phpunit/data/html5lib-tests/README.md new file mode 100644 index 0000000000000..be775c8b497b5 --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/README.md @@ -0,0 +1,25 @@ +# html5lib-tests + +This directory contains a third-party test suite used for testing the WordPress HTML API. 
+ +`html5lib-tests` can be found on GitHub at [html5lib/html5lib-tests](https://github.com/html5lib/html5lib-tests). + +The necessary files have been copied to this directory: + +- `AUTHORS.rst` +- `LICENSE` +- `README.md` +- `tree-construction/README.md` +- `tree-construction/*.dat` + +The version of these files was taken from the git commit with +SHA [`a9f44960a9fedf265093d22b2aa3c7ca123727b9`](https://github.com/html5lib/html5lib-tests/commit/a9f44960a9fedf265093d22b2aa3c7ca123727b9). + +## Updating + +If there have been changes to the html5lib-tests repository, this test suite can be updated. In +order to update: + +1. Check out the latest version of the git repository mentioned above. +1. Copy the files listed above into this directory. +1. Update the SHA mentioned in this README file with the new html5lib-tests SHA. diff --git a/tests/phpunit/data/html5lib-tests/tree-construction/README.md b/tests/phpunit/data/html5lib-tests/tree-construction/README.md new file mode 100644 index 0000000000000..4737a3a867e86 --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/tree-construction/README.md @@ -0,0 +1,108 @@ +Tree Construction Tests +======================= + +Each file containing tree construction tests consists of any number of +tests separated by two newlines (LF) and a single newline before the end +of the file. For instance: + + [TEST]LF + LF + [TEST]LF + LF + [TEST]LF + +Where [TEST] is the following format: + +Each test must begin with a string "\#data" followed by a newline (LF). +All subsequent lines until a line that says "\#errors" are the test data +and must be passed to the system being tested unchanged, except with the +final newline (on the last line) removed. + +Then there must be a line that says "\#errors". It must be followed by +one line per parse error that a conformant checker would return.
It +doesn't matter what those lines are, although they can't be +"\#new-errors", "\#document-fragment", "\#document", "\#script-off", +"\#script-on", or empty, the only thing that matters is that there be +the right number of parse errors. + +Then there \*may\* be a line that says "\#new-errors", which works like +the "\#errors" section adding more errors to the expected number of +errors. + +Then there \*may\* be a line that says "\#document-fragment", which must +be followed by a newline (LF), followed by a string of characters that +indicates the context element, followed by a newline (LF). If the string +of characters starts with "svg ", the context element is in the SVG +namespace and the substring after "svg " is the local name. If the +string of characters starts with "math ", the context element is in the +MathML namespace and the substring after "math " is the local name. +Otherwise, the context element is in the HTML namespace and the string +is the local name. If this line is present the "\#data" must be parsed +using the HTML fragment parsing algorithm with the context element as +context. + +Then there \*may\* be a line that says "\#script-off" or +"\#script-on". If a line that says "\#script-off" is present, the +parser must set the scripting flag to disabled. If a line that says +"\#script-on" is present, it must set it to enabled. Otherwise, the +test should be run in both modes. + +Then there must be a line that says "\#document", which must be followed +by a dump of the tree of the parsed DOM. Each node must be represented +by a single line. Each line must start with "| ", followed by two spaces +per parent node that the node has before the root document node. + +- Element nodes must be represented by a "`<`" then the *tag name + string* "`>`", and all the attributes must be given, sorted + lexicographically by UTF-16 code unit according to their *attribute + name string*, on subsequent lines, as if they were children of the + element node. 
+- Attribute nodes must have the *attribute name string*, then an "=" + sign, then the attribute value in double quotes ("). +- Text nodes must be the string, in double quotes. Newlines aren't + escaped. +- Comments must be "`<`" then "`!-- `" then the data then "` -->`". +- DOCTYPEs must be "`<!DOCTYPE `" then the name then if either of the + system id or public id is non-empty a space, public id in + double-quotes, another space and the system id in double-quotes, and + then in any case "`>`". +- Processing instructions must be "`<?`", then the target, then a + space, then the data and then "`>`". (The HTML parser cannot emit + processing instructions, but scripts can, and the WebVTT to DOM + rules can emit them.) +- Template contents are represented by the string "content" with the + children below it. + +The *tag name string* is the local name prefixed by a namespace +designator. For the HTML namespace, the namespace designator is the +empty string, i.e. there's no prefix. For the SVG namespace, the +namespace designator is "svg ". For the MathML namespace, the namespace +designator is "math ". + +The *attribute name string* is the local name prefixed by a namespace +designator. For no namespace, the namespace designator is the empty +string, i.e. there's no prefix. For the XLink namespace, the namespace +designator is "xlink ". For the XML namespace, the namespace designator +is "xml ". For the XMLNS namespace, the namespace designator is "xmlns +". Note the difference between "xlink:href" which is an attribute in no +namespace with the local name "xlink:href" and "xlink href" which is an +attribute in the xlink namespace with the local name "href". + +If there is also a "\#document-fragment" the bit following "\#document" +must be a representation of the HTML fragment serialization for the +context element given by "\#document-fragment". + +For example: + + #data + <p>One<p>Two + #errors + 3: Missing document type declaration + #document + | <html> + |   <head> + |   <body> + |     <p> + |       "One" + |     <p> + |       "Two" diff --git a/tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat b/tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat new file mode 100644 index 0000000000000..38f98efded0ae --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat @@ -0,0 +1,354 @@ +#data +

+#errors +(1,3): expected-doctype-but-got-start-tag +(1,10): adoption-agency-1.3 +#document +| +| +| +| +|

+| + +#data +1

23

+#errors +(1,3): expected-doctype-but-got-start-tag +(1,12): adoption-agency-1.3 +#document +| +| +| +| +| "1" +|

+| +| "2" +| "3" + +#data +1 +#errors +(1,3): expected-doctype-but-got-start-tag +(1,17): adoption-agency-1.3 +#document +| +| +| +| +| "1" +|