From fa55c370d1a184c8f210c988bc4366f497e6b9b8 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 15 Apr 2024 16:25:03 +0200 Subject: [PATCH 01/12] KSES: Preserve some additional invalid HTML comment syntaxes. When `wp_kses_split` processes a document it attempts to leave HTML comments relatively alone. It makes minor adjustments, but leaves the comments in the document in its output. Unfortunately it only recognizes one kind of HTML comment and rejects many other kinds which appear as the result of various invalid HTML markup. This patch makes a minor adjustment to the algorithm in `wp_kses_split` to allow two additional kinds of HTML comments: - HTML comments with the incorrect closer `--!>`. - Closing tags with an invalid tag name, e.g. ``. In an HTML parser these all become comments, and so leaving them in the document should be a benign operation, improving the reliability of detecting comments in Core. These invalid closing tags, which in a browser are interpreted as comments, are one proposal for a placeholder mechanism in the HTML API unlocking HTML templating, a new kind of shortcode, and more. Having these persist in Core is a requirement for exploring and utilizing the new syntax. --- src/wp-includes/kses.php | 43 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index cccb1768c2dfd..f82202df12bdd 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -963,6 +963,7 @@ function wp_kses_version() { * It also matches stray `>` characters. * * @since 1.0.0 + * @since 6.6.0 Allow certain kinds of invalid HTML comments. * * @global array[]|string $pass_allowed_html An array of allowed HTML elements and attributes, * or a context name such as 'post'. @@ -981,7 +982,18 @@ function wp_kses_split( $content, $allowed_html, $allowed_protocols ) { $pass_allowed_html = $allowed_html; $pass_allowed_protocols = $allowed_protocols; - return preg_replace_callback( '%(|$))|(<[^>]*(>|$)|>)%', '_wp_kses_split_callback', $content ); + $token_pattern = <<|$)) # Well-formed HTML comments like `` and also with invalid `--!>` closer. + | + ]*> # Closing tags with invalid tag names. + ) + | + (<[^>]*(>|$)|>) # Tag-like spans of text. +~sx +REGEX; + return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content ); } /** @@ -1080,12 +1092,37 @@ function _wp_kses_split_callback( $matches ) { function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { $content = wp_kses_stripslashes( $content ); - // It matched a ">" character. + /* + * The regex pattern used to split HTML into chunks attempts + * to split on HTML token boundaries. This function should + * thus receive chunks that _either_ start with meaningful + * syntax tokens, like a tag `
` or a comment ``. + * + * If the first character of the `$content` chunk _isn't_ one + * of these syntax elements, which always starts with `<`, then + * the match had to be for the final alternation of `>`. In such + * case, it's probably standing on its own and could be encoded + * with a character reference to remove ambiguity. + * + * In other words, if this chunk isn't from a match of a syntax + * token, it's just a plaintext greater-than (`>`) sign. + */ if ( ! str_starts_with( $content, '<' ) ) { return '>'; } - // Allow HTML comments. + /* + * Preserve funky comments. + * + * When a closing tag appears with a name that isn't a valid tag name, + * it should be interpreted as an HTML comment. It extends until the + * first `>` character after the initial opening `]*>~', $content ) ) { + return $content; + } + + // Preserve HTML comments. if ( str_starts_with( $content, '' ), '', $content ); From 0601582307e8a5b95e16764cf8715314b05dbc5f Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 16 Apr 2024 10:41:49 +0200 Subject: [PATCH 02/12] Replace the invalid comment closer as well. --- src/wp-includes/kses.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index f82202df12bdd..9e7142c22b042 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1124,7 +1124,7 @@ function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { // Preserve HTML comments. if ( str_starts_with( $content, '' ), '', $content ); + $content = str_replace( array( '', '--!>' ), '', $content ); while ( ( $newstring = wp_kses( $content, $allowed_html, $allowed_protocols ) ) !== $content ) { $content = $newstring; From fd66dda793f01f0381616aa9ed7e083875763349 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 24 Apr 2024 12:41:35 +0300 Subject: [PATCH 03/12] Remove kses test verifying guard against IE conditional comments. --- tests/phpunit/tests/kses.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/phpunit/tests/kses.php b/tests/phpunit/tests/kses.php index 12a7bbac29a42..24fe317b33a50 100644 --- a/tests/phpunit/tests/kses.php +++ b/tests/phpunit/tests/kses.php @@ -337,6 +337,11 @@ public function test_wp_kses_bad_protocol() { } } + /** + * Ensures protection against specific known XSS attacks. + * + * @since 6.6.0 Removed guard against IE conditional comments, since they are not interpreted as they once were. + */ public function test_hackers_attacks() { $xss = simplexml_load_file( DIR_TESTDATA . '/formatting/xssAttacks.xml' ); foreach ( $xss->attack as $attack ) { @@ -408,9 +413,6 @@ public function test_hackers_attacks() { case 'XML HTML+TIME': $this->assertSame( '<t:set attributeName="innerHTML" to="XSSalert(\'XSS\')">', $result ); break; - case 'Commented-out Block': - $this->assertSame( "\nalert('XSS');", $result ); - break; case 'Cookie Manipulation': $this->assertSame( '<META HTTP-EQUIV="Set-Cookie" Content="USERID=alert(\'XSS\')">', $result ); break; From e0e36b00bb7feb19e62a12cf9ae9ed62a02298f4 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 13:06:06 +0200 Subject: [PATCH 04/12] Ensure PCRE pattern matches full string match. --- src/wp-includes/kses.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 9e7142c22b042..a1cfcb2ad2d14 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1118,7 +1118,7 @@ function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { * it should be interpreted as an HTML comment. It extends until the * first `>` character after the initial opening `]*>~', $content ) ) { + if ( 1 === preg_match( '~^]*>$~', $content ) ) { return $content; } From cc39a887365e1a2c95791c53e1526dbc45989277 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 15:50:04 +0200 Subject: [PATCH 05/12] Run comment content through kses --- src/wp-includes/kses.php | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index a1cfcb2ad2d14..4ec7bad60e3e0 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1112,17 +1112,28 @@ function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { } /* - * Preserve funky comments. - * * When a closing tag appears with a name that isn't a valid tag name, - * it should be interpreted as an HTML comment. It extends until the + * it must be interpreted as an HTML comment. It extends until the * first `>` character after the initial opening `]*>$~', $content ) ) { - return $content; + $content = substr( $content, 2, -1 ); + $transformed = null; + + while ( $transformed !== $content ) { + $transformed = wp_kses( $content, $allowed_html, $allowed_protocols ); + $content = $transformed; + } + + return ""; } - // Preserve HTML comments. + /* + * Normative HTML comments should be handled separately as their + * parsing rules differ from those for tags and text nodes. + */ if ( str_starts_with( $content, '', '--!>' ), '', $content ); From 5eacbc35deda3af9c0babaeed953f5e3d0862de6 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 16:00:44 +0200 Subject: [PATCH 06/12] Bring invalid test back, update the expected output. --- tests/phpunit/tests/kses.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/phpunit/tests/kses.php b/tests/phpunit/tests/kses.php index 24fe317b33a50..f4a8a8ea3c904 100644 --- a/tests/phpunit/tests/kses.php +++ b/tests/phpunit/tests/kses.php @@ -413,6 +413,9 @@ public function test_hackers_attacks() { case 'XML HTML+TIME': $this->assertSame( '<t:set attributeName="innerHTML" to="XSSalert(\'XSS\')">', $result ); break; + case 'Deprecated IE Conditional Comment Syntax': + $this->assertSame( "", $result ); + break; case 'Cookie Manipulation': $this->assertSame( '<META HTTP-EQUIV="Set-Cookie" Content="USERID=alert(\'XSS\')">', $result ); break; From 7c444827c8427ef0c4665e77623aeb94a361c444 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 17:39:48 +0200 Subject: [PATCH 07/12] Remove additional fix. --- src/wp-includes/kses.php | 2 +- tests/phpunit/data/formatting/xssAttacks.xml | 6 +++--- tests/phpunit/tests/kses.php | 9 ++------- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 4ec7bad60e3e0..bff110747e3ba 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -991,7 +991,7 @@ function wp_kses_split( $content, $allowed_html, $allowed_protocols ) { ) | (<[^>]*(>|$)|>) # Tag-like spans of text. -~sx +~x REGEX; return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content ); } diff --git a/tests/phpunit/data/formatting/xssAttacks.xml b/tests/phpunit/data/formatting/xssAttacks.xml index 42f3dd12806f8..a1861b19e5f34 100644 --- a/tests/phpunit/data/formatting/xssAttacks.xml +++ b/tests/phpunit/data/formatting/xssAttacks.xml @@ -376,9 +376,9 @@ xss:&#101;x&#x2F;*XSS*//*/*/pression(alert("XSS"))'> Commented-out Block - <!--[if gte IE 4]> -<SCRIPT>alert('XSS');</SCRIPT> -<![endif]--> + + +]]> Downlevel-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore it does not need to be removed, which allows our XSS vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job. Browser support: [<span class="s">IE6.0</span>|<span class="s">NS8.1-IE</span>] [<span class="ns">NS8.1-G</span>|<span class="ns">FF1.5</span>] [<span class="ns">O8.54</span>] diff --git a/tests/phpunit/tests/kses.php b/tests/phpunit/tests/kses.php index f4a8a8ea3c904..12a7bbac29a42 100644 --- a/tests/phpunit/tests/kses.php +++ b/tests/phpunit/tests/kses.php @@ -337,11 +337,6 @@ public function test_wp_kses_bad_protocol() { } } - /** - * Ensures protection against specific known XSS attacks. - * - * @since 6.6.0 Removed guard against IE conditional comments, since they are not interpreted as they once were. - */ public function test_hackers_attacks() { $xss = simplexml_load_file( DIR_TESTDATA . '/formatting/xssAttacks.xml' ); foreach ( $xss->attack as $attack ) { @@ -413,8 +408,8 @@ public function test_hackers_attacks() { case 'XML HTML+TIME': $this->assertSame( '<t:set attributeName="innerHTML" to="XSSalert(\'XSS\')">', $result ); break; - case 'Deprecated IE Conditional Comment Syntax': - $this->assertSame( "", $result ); + case 'Commented-out Block': + $this->assertSame( "\nalert('XSS');", $result ); break; case 'Cookie Manipulation': $this->assertSame( '<META HTTP-EQUIV="Set-Cookie" Content="USERID=alert(\'XSS\')">', $result ); From b60d0e02ca3ee406830b7e4942ad73845ff8469c Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 17:47:14 +0200 Subject: [PATCH 08/12] Remove other fixes. --- src/wp-includes/kses.php | 2 +- tests/phpunit/data/formatting/xssAttacks.xml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index bff110747e3ba..46209b9da8968 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1135,7 +1135,7 @@ function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { * parsing rules differ from those for tags and text nodes. */ if ( str_starts_with( $content, '', '--!>' ), '', $content ); + $content = str_replace( array( '' ), '', $content ); while ( ( $newstring = wp_kses( $content, $allowed_html, $allowed_protocols ) ) !== $content ) { $content = $newstring; diff --git a/tests/phpunit/data/formatting/xssAttacks.xml b/tests/phpunit/data/formatting/xssAttacks.xml index a1861b19e5f34..42f3dd12806f8 100644 --- a/tests/phpunit/data/formatting/xssAttacks.xml +++ b/tests/phpunit/data/formatting/xssAttacks.xml @@ -376,9 +376,9 @@ xss:&#101;x&#x2F;*XSS*//*/*/pression(alert("XSS"))'> Commented-out Block - - -]]> + <!--[if gte IE 4]> +<SCRIPT>alert('XSS');</SCRIPT> +<![endif]--> Downlevel-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore it does not need to be removed, which allows our XSS vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job. Browser support: [<span class="s">IE6.0</span>|<span class="s">NS8.1-IE</span>] [<span class="ns">NS8.1-G</span>|<span class="ns">FF1.5</span>] [<span class="ns">O8.54</span>] From 8a317029a3ca8490a12e7ee729c9da06daf55e4f Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 17:52:48 +0200 Subject: [PATCH 09/12] Remove other fix, update docs. --- src/wp-includes/kses.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 46209b9da8968..68414fcf2b943 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -984,13 +984,13 @@ function wp_kses_split( $content, $allowed_html, $allowed_protocols ) { $token_pattern = <<|$)) # Well-formed HTML comments like `` and also with invalid `--!>` closer. + ( # Detect comments of various flavors before attempting to find tags. + (?:|$)) # - Normative HTML comments. | - ]*> # Closing tags with invalid tag names. + ]*> # - Closing tags with invalid tag names. ) | - (<[^>]*(>|$)|>) # Tag-like spans of text. + (<[^>]*(>|$)|>) # Tag-like spans of text. ~x REGEX; return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content ); From 5dbbe50969e842bc5170eaf4eb39bdcc78d5ba5a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 17:59:29 +0200 Subject: [PATCH 10/12] PCRE pattern cleanup. --- src/wp-includes/kses.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 68414fcf2b943..0ddcdc611c54e 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -984,13 +984,13 @@ function wp_kses_split( $content, $allowed_html, $allowed_protocols ) { $token_pattern = <<|$)) # - Normative HTML comments. + ( # Detect comments of various flavors before attempting to find tags. + (|$)) # - Normative HTML comments. | - ]*> # - Closing tags with invalid tag names. + ]*> # - Closing tags with invalid tag names. ) | - (<[^>]*(>|$)|>) # Tag-like spans of text. + (<[^>]*(>|$)|>) # Tag-like spans of text. ~x REGEX; return preg_replace_callback( $token_pattern, '_wp_kses_split_callback', $content ); From dca7d00278d901de295662601c8c0a0bd16f799e Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 18:08:25 +0200 Subject: [PATCH 11/12] Update @since tags --- src/wp-includes/kses.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 0ddcdc611c54e..3e4e086eead82 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -963,7 +963,7 @@ function wp_kses_version() { * It also matches stray `>` characters. * * @since 1.0.0 - * @since 6.6.0 Allow certain kinds of invalid HTML comments. + * @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments. * * @global array[]|string $pass_allowed_html An array of allowed HTML elements and attributes, * or a context name such as 'post'. @@ -1081,12 +1081,14 @@ function _wp_kses_split_callback( $matches ) { * @access private * @ignore * @since 1.0.0 + * @since 6.6.0 Recognize additional forms of invalid HTML which convert into comments. * * @param string $content Content to filter. * @param array[]|string $allowed_html An array of allowed HTML elements and attributes, * or a context name such as 'post'. See wp_kses_allowed_html() * for the list of accepted context names. * @param string[] $allowed_protocols Array of allowed URL protocols. + * * @return string Fixed HTML element */ function wp_kses_split2( $content, $allowed_html, $allowed_protocols ) { From 5949d4cf57a41e2734c05841b191e264531dd39c Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 12 Jun 2024 22:03:24 +0200 Subject: [PATCH 12/12] Add a basic unit test to prevent regressions. --- tests/phpunit/tests/kses.php | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/phpunit/tests/kses.php b/tests/phpunit/tests/kses.php index 12a7bbac29a42..eb5d0a8004a95 100644 --- a/tests/phpunit/tests/kses.php +++ b/tests/phpunit/tests/kses.php @@ -1912,6 +1912,35 @@ public function filter_wp_kses_object_added_in_html_filter( $tags, $context ) { return $tags; } + /** + * Ensures that `wp_kses()` preserves various kinds of HTML comments, both valid and invalid. + * + * @ticket 61009 + * + * @param string $html_comment HTML containing a comment; must not be a valid comment + * but must be syntax which a browser interprets as a comment. + * @param string $expected_output How `wp_kses()` ought to transform the comment. + */ + public function wp_kses_preserves_html_comments( $html_comment, $expected_output ) { + $this->assertSame( + $expected_output, + wp_kses( $html_comment, array() ), + 'Failed to properly preserve HTML comment.' + ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_html_containing_various_kinds_of_html_comments() { + return array( + 'Normative HTML comment' => array( 'beforeafter', 'beforeafter' ), + 'Closing tag with invalid tag name' => array( 'beforeafter', 'beforeafter' ), + ); + } + /** * Test that attributes with a list of allowed values are filtered correctly. *