diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index dd8e8d5f2d6e8..02c49f843358e 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -361,6 +361,10 @@ public function get_last_error() { public function next_tag( $query = null ) { if ( null === $query ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -384,6 +388,10 @@ public function next_tag( $query = null ) { if ( ! ( array_key_exists( 'breadcrumbs', $query ) && is_array( $query['breadcrumbs'] ) ) ) { while ( $this->step() ) { + if ( '#tag' !== $this->get_token_type() ) { + continue; + } + if ( ! $this->is_tag_closer() ) { return true; } @@ -414,27 +422,14 @@ public function next_tag( $query = null ) { } /** - * Ensures internal accounting is maintained for HTML semantic rules while - * the underlying Tag Processor class is seeking to a bookmark. - * - * This doesn't currently have a way to represent non-tags and doesn't process - * semantic rules for text nodes. For access to the raw tokens consider using - * WP_HTML_Tag_Processor instead. + * Advances to the next token in the document. * - * @since 6.5.0 Added for internal support; do not use. - * - * @access private + * @since 6.5.0 * - * @return bool + * @return bool Whether a token was found. */ public function next_token() { - $found_a_token = parent::next_token(); - - if ( '#tag' === $this->get_token_type() ) { - $this->step( self::PROCESS_CURRENT_NODE ); - } - - return $found_a_token; + return $this->step(); } /** @@ -495,7 +490,7 @@ public function matches_breadcrumbs( $breadcrumbs ) { } /** - * Steps through the HTML document and stop at the next tag, if any. + * Steps through the HTML document and stop at the next token, if any. 
* * @since 6.4.0 * @@ -505,7 +500,7 @@ public function matches_breadcrumbs( $breadcrumbs ) { * @see self::REPROCESS_CURRENT_NODE * * @param string $node_to_process Whether to parse the next node or reprocess the current node. - * @return bool Whether a tag was matched. + * @return bool Whether a token was matched. */ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { // Refuse to proceed if there was a previous error. @@ -528,31 +523,82 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * special since it's only self-closing if the self-closing flag * is provided in the opening tag, otherwise it expects a tag closer. */ - $top_node = $this->state->stack_of_open_elements->current_node(); - if ( $top_node && self::is_void( $top_node->node_name ) ) { + $bottom_node = $this->state->stack_of_open_elements->current_node(); + if ( + $bottom_node && + ( + ( 'html' !== $bottom_node->namespace && $bottom_node->has_self_closing_flag ) || + self::is_void( $bottom_node->node_name ) || + $bottom_node->node_name[0] < 'A' || + $bottom_node->node_name[0] > 'Z' + ) + ) { $this->state->stack_of_open_elements->pop(); } } + $token_name = $this->get_token_name(); + + // @todo This is the context node in a fragment but it seems to be the same in practice. + $adjusted_current_node = $this->state->stack_of_open_elements->current_node(); + + // @see https://html.spec.whatwg.org/#tree-construction-dispatcher + $parse_in_foreign_content = ! ( + ! $adjusted_current_node || + 'html' === $adjusted_current_node->namespace || + ( + 'math' === $adjusted_current_node->integration_node_type && + ! $this->is_tag_closer() && + 'MGLYPH' !== $token_name && + 'MALIGNMARK' !== $token_name + ) || + ( + 'math' === $adjusted_current_node->integration_node_type && + '#text' === $token_name + ) || + ( + 'math' === $adjusted_current_node->integration_node_type && + 'ANNOTATION-XML' === $adjusted_current_node->node_name && + ! 
$this->is_tag_closer() && + 'SVG' === $token_name + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + ! $this->is_tag_closer() + ) || + ( + 'html' === $adjusted_current_node->integration_node_type && + '#text' === $token_name + ) + ); + + $this->is_inside_foreign_content = $parse_in_foreign_content; + if ( self::PROCESS_NEXT_NODE === $node_to_process ) { - while ( parent::next_token() && '#tag' !== $this->get_token_type() ) { - continue; - } + parent::next_token(); + $token_name = $this->get_token_name(); } - // Finish stepping when there are no more tokens in the document. - if ( null === $this->get_tag() ) { + if ( + WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state || + WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state + ) { return false; } $this->state->current_token = new WP_HTML_Token( $this->bookmark_tag(), - $this->get_tag(), + $token_name, $this->has_self_closing_flag(), $this->release_internal_bookmark_on_destruct ); try { + + if ( $parse_in_foreign_content ) { + return $this->step_in_foreign_content(); + } + switch ( $this->state->insertion_mode ) { case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY: return $this->step_in_body(); @@ -591,10 +637,6 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) { * @return string[]|null Array of tag names representing path to matched node, if matched, otherwise NULL. */ public function get_breadcrumbs() { - if ( ! $this->get_tag() ) { - return null; - } - $breadcrumbs = array(); foreach ( $this->state->stack_of_open_elements->walk_down() as $stack_item ) { $breadcrumbs[] = $stack_item->node_name; @@ -619,11 +661,37 @@ public function get_breadcrumbs() { * @return bool Whether an element was found. */ private function step_in_body() { - $tag_name = $this->get_tag(); - $op_sigil = $this->is_tag_closer() ? 
'-' : '+'; - $op = "{$op_sigil}{$tag_name}"; + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; switch ( $op ) { + case '#comment': + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + // Ignore DOCTYPE declarations. + return $this->step(); + + case '#text': + $text_at = $this->bookmarks[ $this->state->current_token->bookmark_name ]->start; + $text_length = $this->bookmarks[ $this->state->current_token->bookmark_name ]->length; + + if ( 1 === $text_length && "\x00" === $this->html[ $text_at ] ) { + // Ignore this token. + return $this->step(); + } + + if ( strspn( $this->html, " \t\n\f\r", $text_at, $text_length ) !== $text_length ) { + $this->state->frameset_ok = false; + } + + $this->reconstruct_active_formatting_elements(); + $this->insert_html_element( $this->state->current_token ); + return true; + /* * > A start tag whose tag name is "button" */ @@ -1025,6 +1093,24 @@ private function step_in_body() { case '+TRACK': $this->insert_html_element( $this->state->current_token ); return true; + + /* + * > A start tag whose tag name is "math" + */ + case '+MATH': + $this->reconstruct_active_formatting_elements(); + $this->state->current_token->namespace = 'math'; + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "svg" + */ + case '+SVG': + $this->reconstruct_active_formatting_elements(); + $this->state->current_token->namespace = 'svg'; + $this->insert_html_element( $this->state->current_token ); + return true; } /* @@ -1137,6 +1223,224 @@ private function step_in_body() { } } + /** + * @todo … + */ + public function get_namespace() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return $this->state->current_token->namespace; + default: + return null; + } + } + + /** + * Parses next element 
in foreign content. + * + * This internal function performs the 'in foreign content' logic + * for the generalized WP_HTML_Processor::step() function. This is + * not a distinct insertion mode. + * + * @since 6.5.0 + * + * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input. + * + * @see WP_HTML_Processor::step + * @see https://html.spec.whatwg.org/#parsing-main-inforeign + * + * @return bool Whether an element was found. + */ + private function step_in_foreign_content() { + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + /* + * > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size" + */ + if ( + '+FONT' === $op && + ( + null !== $this->get_attribute( 'color' ) || + null !== $this->get_attribute( 'face' ) || + null !== $this->get_attribute( 'size' ) + ) + ) { + // @todo Indicate a parse error once it's possible. + $current_node = $this->state->stack_of_open_elements->current_node(); + while ( $current_node && null === $current_node->integration_node_type && 'html' !== $current_node->namespace ) { + $this->state->stack_of_open_elements->pop(); + $current_node = $this->state->stack_of_open_elements->current_node(); + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + switch ( $op ) { + case '#cdata-section': + case '#comment': + $this->insert_html_element( $this->state->current_token ); + return true; + + case 'html': + // Ignore DOCTYPE declarations. 
+ return $this->step(); + + case '#text': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A start tag whose tag name is "b", "big", "blockquote", "body", "br", "center", + * > "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5", + * > "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol", + * > "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup", + * > "table", "tt", "u", "ul", "var" + * + * > An end tag whose tag name is "br", "p" + */ + case '+B': + case '+BIG': + case '+BLOCKQUOTE': + case '+BODY': + case '+BR': + case '+CENTER': + case '+CODE': + case '+DD': + case '+DIV': + case '+DL': + case '+DT': + case '+EM': + case '+EMBED': + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + case '+HEAD': + case '+HR': + case '+I': + case '+IMG': + case '+LI': + case '+LISTING': + case '+MENU': + case '+META': + case '+NOBR': + case '+OL': + case '+P': + case '+PRE': + case '+RUBY': + case '+S': + case '+SMALL': + case '+SPAN': + case '+STRONG': + case '+STRIKE': + case '+SUB': + case '+SUP': + case '+TABLE': + case '+TT': + case '+U': + case '+UL': + case '+VAR': + case '-BR': + case '-P': + // @todo Indicate a parse error once it's possible. + $current_node = $this->state->stack_of_open_elements->current_node(); + while ( $current_node && null === $current_node->integration_node_type && 'html' !== $current_node->namespace ) { + $this->state->stack_of_open_elements->pop(); + $current_node = $this->state->stack_of_open_elements->current_node(); + } + + return $this->step( self::REPROCESS_CURRENT_NODE ); + } + + /* + * > Any other start tag + */ + if ( ! $this->is_tag_closer() ) { + // @todo Adjust foreign attributes; this probably should be done in get_attribute(). + + // @todo This is the adjusted current node, will it ever not be this? 
+ $current_node = $this->state->stack_of_open_elements->current_node(); + $this->state->current_token->namespace = $current_node->namespace; + + if ( + 'svg' === $current_node->namespace && + ( 'DESC' === $tag_name || 'FOREIGNOBJECT' === $tag_name || 'TITLE' === $tag_name ) + ) { + $this->state->current_token->integration_node_type = 'html'; + } elseif ( + 'math' === $current_node->namespace && + ( 'MI' === $tag_name || 'MO' === $tag_name || 'MN' === $tag_name || 'MS' === $tag_name || 'MTEXT' === $tag_name ) + ) { + $this->state->current_token->integration_node_type = 'math'; + } elseif ( 'math' === $current_node->namespace && 'ANNOTATION-XML' === $tag_name ) { + $encoding = $this->get_attribute( 'encoding' ); + + if ( is_string( $encoding ) ) { + $encoding = strtolower( $encoding ); + + if ( 'text/html' === $encoding || 'application/xhtml+xml' === $encoding ) { + $this->state->current_token->integration_node_type = 'html'; + } + } + } + + // @todo There should be a false `onlyAddToElementStack` parameter that does stuff. + $this->insert_html_element( $this->state->current_token ); + return true; + } + + /* + * > An end tag whose name is "script", if the current node is an SVG script element. + */ + // @todo Does this rule matter here? + + /* + * > Any other end tag + */ + if ( $this->is_tag_closer() ) { + $node = $this->state->stack_of_open_elements->current_node(); + if ( $tag_name !== $node->node_name ) { + // @todo Indicate a parse error once it's possible. + } + in_foreign_content_end_tag_loop: + if ( $node === $this->state->stack_of_open_elements->stack[0] ) { + // @todo should this return $this->step() instead? + return true; + } + + /* + * > If node's tag name, converted to ASCII lowercase, is the same as the tag name + * > of the token, pop elements from the stack of open elements until node has + * > been popped from the stack, and then return. 
+ */ + if ( $node->node_name === $tag_name ) { + foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) { + $this->state->stack_of_open_elements->pop(); + if ( $node === $item ) { + // @todo should this return $this->step() instead? + return true; + } + } + } + + foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) { + $node = $item; + break; + } + + if ( 'html' !== $node->namespace ) { + goto in_foreign_content_end_tag_loop; + } + + return $this->step( self::PROCESS_CURRENT_NODE ); + } + } + /* * Internal helpers */ @@ -1151,10 +1455,6 @@ private function step_in_body() { * @return string|false Name of created bookmark, or false if unable to create. */ private function bookmark_tag() { - if ( ! $this->get_tag() ) { - return false; - } - if ( ! parent::set_bookmark( ++$this->bookmark_counter ) ) { $this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS; throw new Exception( 'could not allocate bookmark' ); @@ -1611,6 +1911,82 @@ private function insert_html_element( $token ) { * HTML Specification Helpers */ + /** + * Indicates if the current token is a MathML integration point. + * + * @since 6.5.0 + * + * @see https://html.spec.whatwg.org/#mathml-text-integration-point + * + * @return bool Whether the current token is a MathML integration point. + */ + private function is_mathml_integration_point() { + $token = $this->state->current_token; + + if ( 'math' !== $token->namespace || 'M' !== $token->node_name[0] ) { + return false; + } + + $tag_name = $token->node_name; + + return ( + 'MI' === $tag_name || + 'MO' === $tag_name || + 'MN' === $tag_name || + 'MS' === $tag_name || + 'MTEXT' === $tag_name + ); + } + + /** + * Indicates if the current token is an HTML integration point. + * + * Note that this method must be an instance method with access + * to the current token, since it needs to examine the attributes + * of the currently-matched tag, if it's in the MathML namespace. 
+ * Otherwise it would be required to scan the HTML and ensure that + * no other accounting is overlooked. + * + * @since 6.5.0 + * + * @see https://html.spec.whatwg.org/#html-integration-point + * + * @return bool Whether the current token is an HTML integration point. + */ + private function is_html_integration_point() { + $token = $this->state->current_token; + + if ( 'html' === $token->namespace ) { + return false; + } + + $tag_name = $token->node_name; + + if ( 'svg' === $token->namespace ) { + return ( + 'DESC' === $tag_name || + 'FOREIGNOBJECT' === $tag_name || + 'TITLE' === $tag_name + ); + } + + if ( 'math' === $token->namespace ) { + if ( 'ANNOTATION-XML' !== $tag_name ) { + return false; + } + + $encoding = $this->get_attribute( 'encoding' ); + + return ( + is_string( $encoding ) && + ( + 0 === strcasecmp( $encoding, 'application/xhtml+xml' ) || + 0 === strcasecmp( $encoding, 'text/html' ) + ) + ); + } + } + /** * Returns whether an element of a given name is in the HTML special category. * diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index b437595bd9e4d..69e8fca3e63af 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -515,6 +515,21 @@ class WP_HTML_Tag_Processor { */ protected $parser_state = self::STATE_READY; + /** + * Indicates whether the parser is inside foreign content, + * e.g. inside an SVG or MathML element. + * + * Several parsing rules change based on whether the parser + * is inside foreign content, including whether CDATA sections + * are allowed and whether a self-closing flag indicates that + * an element has no content. + * + * @since 6.5.0 + * + * @var bool + */ + protected $is_inside_foreign_content = false; + /** * What kind of syntax token became an HTML comment. 
* @@ -944,7 +959,6 @@ public function next_token() { $duplicate_attributes = $this->duplicate_attributes; // Find the closing tag if necessary. - $found_closer = false; switch ( $tag_name ) { case 'SCRIPT': $found_closer = $this->skip_script_data(); @@ -1719,6 +1733,32 @@ private function parse_next_tag() { return true; } + if ( + $this->is_inside_foreign_content && + strlen( $html ) > $at + 8 && + '[' === $html[ $at + 2 ] && + 'C' === $html[ $at + 3 ] && + 'D' === $html[ $at + 4 ] && + 'A' === $html[ $at + 5 ] && + 'T' === $html[ $at + 6 ] && + 'A' === $html[ $at + 7 ] && + '[' === $html[ $at + 8 ] + ) { + $closer_at = strpos( $html, ']]>', $at + 1 ); + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE_INPUT; + + return false; + } + + $this->parser_state = self::STATE_CDATA_NODE; + $this->text_starts_at = $at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; + } + /* * Anything else here is an incorrectly-opened comment and transitions * to the bogus comment state - skip to the nearest >. If no closer is diff --git a/src/wp-includes/html-api/class-wp-html-token.php b/src/wp-includes/html-api/class-wp-html-token.php index 86dd7658cfcee..ca0e4ed37f2bd 100644 --- a/src/wp-includes/html-api/class-wp-html-token.php +++ b/src/wp-includes/html-api/class-wp-html-token.php @@ -60,6 +60,24 @@ class WP_HTML_Token { */ public $has_self_closing_flag = false; + /** + * Indicates if the element is an HTML element or if it's inside foreign content. + * + * @since 6.5.0 + * + * @var string 'html', 'svg', or 'math'. + */ + public $namespace = 'html'; + + /** + * Indicates which kind of integration point the element is, if any. + * + * @since 6.5.0 + * + * @var string|null 'math', 'html', or null if not an integration point. 
+ */ + public $integration_node_type = null; + /** * Called when token is garbage-collected or otherwise destroyed. * diff --git a/tests/phpunit/data/html5lib-tests/AUTHORS.rst b/tests/phpunit/data/html5lib-tests/AUTHORS.rst new file mode 100644 index 0000000000000..4a7de17ad456c --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/AUTHORS.rst @@ -0,0 +1,34 @@ +Credits +======= + +The ``html5lib`` test data is maintained by: + +- James Graham +- Geoffrey Sneddon + + +Contributors +------------ + +- Adam Barth +- Andi Sidwell +- Anne van Kesteren +- David Flanagan +- Edward Z. Yang +- Geoffrey Sneddon +- Henri Sivonen +- Ian Hickson +- Jacques Distler +- James Graham +- Lachlan Hunt +- lantis63 +- Mark Pilgrim +- Mats Palmgren +- Ms2ger +- Nolan Waite +- Philip Taylor +- Rafael Weinstein +- Ryan King +- Sam Ruby +- Simon Pieters +- Thomas Broyer diff --git a/tests/phpunit/data/html5lib-tests/LICENSE b/tests/phpunit/data/html5lib-tests/LICENSE new file mode 100644 index 0000000000000..8812371b41cfc --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) 2006-2013 James Graham, Geoffrey Sneddon, and +other contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/tests/phpunit/data/html5lib-tests/README.md b/tests/phpunit/data/html5lib-tests/README.md new file mode 100644 index 0000000000000..be775c8b497b5 --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/README.md @@ -0,0 +1,25 @@ +# html5lib-tests + +This directory contains a third-party test suite used for testing the WordPress HTML API. + +`html5lib-tests` can be found on GitHub at [html5lib/html5lib-tests](https://github.com/html5lib/html5lib-tests). + +The necessary files have been copied to this directory: + +- `AUTHORS.rst` +- `LICENSE` +- `README.md` +- `tree-construction/README.md` +- `tree-construction/*.dat` + +The version of these files was taken from the git commit with +SHA [`a9f44960a9fedf265093d22b2aa3c7ca123727b9`](https://github.com/html5lib/html5lib-tests/commit/a9f44960a9fedf265093d22b2aa3c7ca123727b9). + +## Updating + +If there have been changes to the html5lib-tests repository, this test suite can be updated. In +order to update: + +1. Check out the latest version of git repository mentioned above. +1. Copy the files listed above into this directory. +1. Update the SHA mentioned in this README file with the new html5lib-tests SHA. diff --git a/tests/phpunit/data/html5lib-tests/tree-construction/README.md b/tests/phpunit/data/html5lib-tests/tree-construction/README.md new file mode 100644 index 0000000000000..4737a3a867e86 --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/tree-construction/README.md @@ -0,0 +1,108 @@ +Tree Construction Tests +======================= + +Each file containing tree construction tests consists of any number of +tests separated by two newlines (LF) and a single newline before the end +of the file. 
For instance: + + [TEST]LF + LF + [TEST]LF + LF + [TEST]LF + +Where [TEST] is the following format: + +Each test must begin with a string "\#data" followed by a newline (LF). +All subsequent lines until a line that says "\#errors" are the test data +and must be passed to the system being tested unchanged, except with the +final newline (on the last line) removed. + +Then there must be a line that says "\#errors". It must be followed by +one line per parse error that a conformant checker would return. It +doesn't matter what those lines are, although they can't be +"\#new-errors", "\#document-fragment", "\#document", "\#script-off", +"\#script-on", or empty, the only thing that matters is that there be +the right number of parse errors. + +Then there \*may\* be a line that says "\#new-errors", which works like +the "\#errors" section adding more errors to the expected number of +errors. + +Then there \*may\* be a line that says "\#document-fragment", which must +be followed by a newline (LF), followed by a string of characters that +indicates the context element, followed by a newline (LF). If the string +of characters starts with "svg ", the context element is in the SVG +namespace and the substring after "svg " is the local name. If the +string of characters starts with "math ", the context element is in the +MathML namespace and the substring after "math " is the local name. +Otherwise, the context element is in the HTML namespace and the string +is the local name. If this line is present the "\#data" must be parsed +using the HTML fragment parsing algorithm with the context element as +context. + +Then there \*may\* be a line that says "\#script-off" or +"\#script-on". If a line that says "\#script-off" is present, the +parser must set the scripting flag to disabled. If a line that says +"\#script-on" is present, it must set it to enabled. Otherwise, the +test should be run in both modes. 
+ +Then there must be a line that says "\#document", which must be followed +by a dump of the tree of the parsed DOM. Each node must be represented +by a single line. Each line must start with "| ", followed by two spaces +per parent node that the node has before the root document node. + +- Element nodes must be represented by a "`<`" then the *tag name + string* "`>`", and all the attributes must be given, sorted + lexicographically by UTF-16 code unit according to their *attribute + name string*, on subsequent lines, as if they were children of the + element node. +- Attribute nodes must have the *attribute name string*, then an "=" + sign, then the attribute value in double quotes ("). +- Text nodes must be the string, in double quotes. Newlines aren't + escaped. +- Comments must be "`<`" then "`!-- `" then the data then "` -->`". +- DOCTYPEs must be "`<!DOCTYPE `" then the name then if either of the + system id or public id is non-empty a space, public id in + double-quotes, another space and the system id in double-quotes, and + then in any case "`>`". +- Processing instructions must be "`<?`", then the target, then a + space, then the data and then "`>`". (The HTML parser cannot emit + processing instructions, but scripts can, and the WebVTT to DOM + rules can emit them.) +- Template contents are represented by the string "content" with the + children below it. + +The *tag name string* is the local name prefixed by a namespace +designator. For the HTML namespace, the namespace designator is the +empty string, i.e. there's no prefix. For the SVG namespace, the +namespace designator is "svg ". For the MathML namespace, the namespace +designator is "math ". + +The *attribute name string* is the local name prefixed by a namespace +designator. For no namespace, the namespace designator is the empty +string, i.e. there's no prefix. For the XLink namespace, the namespace +designator is "xlink ". For the XML namespace, the namespace designator +is "xml ". For the XMLNS namespace, the namespace designator is "xmlns +". Note the difference between "xlink:href" which is an attribute in no +namespace with the local name "xlink:href" and "xlink href" which is an +attribute in the xlink namespace with the local name "href". 
+ +If there is also a "\#document-fragment" the bit following "\#document" +must be a representation of the HTML fragment serialization for the +context element given by "\#document-fragment". + +For example: + + #data +

One

Two + #errors + 3: Missing document type declaration + #document + | + | + | + |

+ | "One" + |

+ | "Two" diff --git a/tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat b/tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat new file mode 100644 index 0000000000000..38f98efded0ae --- /dev/null +++ b/tests/phpunit/data/html5lib-tests/tree-construction/adoption01.dat @@ -0,0 +1,354 @@ +#data +

+#errors +(1,3): expected-doctype-but-got-start-tag +(1,10): adoption-agency-1.3 +#document +| +| +| +| +|

+| + +#data +1

23

+#errors +(1,3): expected-doctype-but-got-start-tag +(1,12): adoption-agency-1.3 +#document +| +| +| +| +| "1" +|

+| +| "2" +| "3" + +#data +1 +#errors +(1,3): expected-doctype-but-got-start-tag +(1,17): adoption-agency-1.3 +#document +| +| +| +| +| "1" +|