From a6b66980b6f31168b6478228eaeb9313f2d1a5a7 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Fri, 27 Jun 2025 20:09:05 +0200 Subject: [PATCH 1/2] Document the behavior and codepoints in valid_unicode --- src/wp-includes/kses.php | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index ebb4a761b1150..6ca1760dff35e 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -2083,18 +2083,39 @@ function wp_kses_normalize_entities3( $matches ) { /** * Determines if a Unicode codepoint is valid. * + * The definition of a valid Unicode codepoint is taken from the XML definition: + * + * > Characters + * > + * > … + * > Legal characters are tab, carriage return, line feed, and the legal characters of + * > Unicode and ISO/IEC 10646. + * > … + * > Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + * * @since 2.7.0 * + * @see https://www.w3.org/TR/xml/#charsets + * * @param int $i Unicode codepoint. * @return bool Whether or not the codepoint is a valid Unicode codepoint. */ function valid_unicode( $i ) { $i = (int) $i; - return ( 0x9 === $i || 0xa === $i || 0xd === $i || - ( 0x20 <= $i && $i <= 0xd7ff ) || - ( 0xe000 <= $i && $i <= 0xfffd ) || - ( 0x10000 <= $i && $i <= 0x10ffff ) + return ( + 0x9 === $i || // U+0009 HORIZONTAL TABULATION (HT) + 0xA === $i || // U+000A LINE FEED (LF) + 0xD === $i || // U+000D CARRIAGE RETURN (CR) + /* + * The valid Unicode characters according to the + * {@link https://www.w3.org/TR/xml/#charsets XML specification}. + * + * > any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. + */ + ( 0x20 <= $i && $i <= 0xD7FF ) || + ( 0xE000 <= $i && $i <= 0xFFFD ) || + ( 0x10000 <= $i && $i <= 0x10FFFF ) ); } From 312bbbae79fbea5ab88f8c0a67d0eb240ae8dacc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Sat, 28 Jun 2025 12:43:44 +0200 Subject: [PATCH 2/2] Remove link from XML unicode character comment --- src/wp-includes/kses.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 6ca1760dff35e..28bbce222a214 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -2108,8 +2108,7 @@ function valid_unicode( $i ) { 0xA === $i || // U+000A LINE FEED (LF) 0xD === $i || // U+000D CARRIAGE RETURN (CR) /* - * The valid Unicode characters according to the - * {@link https://www.w3.org/TR/xml/#charsets XML specification}. + * The valid Unicode characters according to the XML specification: * * > any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */