From 2696b6d864c9befcf3d64ad75dc3e4b6900e4a4b Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Apr 2024 00:29:53 +0200 Subject: [PATCH 1/4] Block Parser: Start building a unified block/HTML parser. Many blocks are starting to be modified with the HTML API. Given that the HTML API will already be parsing block content, and also that it reads HTML comments along the way, it might make sense to combine the multiple passes of block and HTML parsing into one. Combining the parsers could: - introduce new _before_ and _after_ hooks where blocks can be inserted. This could introduce a new block hooks mechanism which doesn't require parsing and traversing the block structure, but can "tag along" with the normal process. - provide new insight and context into the block parsing, such as knowing where in the HTML a block is, or where in a block tree the given HTML is. - improve efficiency by lazily decoding the JSON attributes and by reusing the input string rather than breaking it up into many substring copies. - process Interactivity API directives as the document is parsed, linearly, and in a streaming fashion. This would avoid the need to run the Server Directive Processor over the entire document after it's already been parsed. - post-process block output to run global policies concerning attribute values and allowable HTML markup. It could, however, slow down some operations that otherwise wouldn't require both block and HTML parsing. For example, the block parser is fast, if memory-hungry. Parsing HTML along the way could slow it down. 
--- .../class-wp-unified-block-parser.php | 273 ++++++++++++++++++ src/wp-settings.php | 1 + 2 files changed, 274 insertions(+) create mode 100644 src/wp-includes/html-api/class-wp-unified-block-parser.php diff --git a/src/wp-includes/html-api/class-wp-unified-block-parser.php b/src/wp-includes/html-api/class-wp-unified-block-parser.php new file mode 100644 index 0000000000000..595f088c985f6 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-unified-block-parser.php @@ -0,0 +1,273 @@ +json = $json; + $this->at = $at; + $this->length = $length; + } + + /** + * Parses and returns the value of the input JSON. + * + * @return mixed + */ + public function parse() { + if ( isset( $this->parsed_value ) ) { + return $this->parsed_value; + } + + $this->parsed_value = json_decode( $this->source() ); + return $this->parsed_value; + } + + public function source() { + $length = $this->length ?? strlen( $this->json ) - $this->at; + + return substr( $this->json, $this->at, $length ); + } +} + +class WP_Unified_Block_Parser { + /** @var WP_HTML_Processor */ + private $processor; + + /** @var array[] */ + private $blocks = array(); + + public function __construct( $html ) { + $this->processor = WP_HTML_Processor::create_fragment( $html ); + } + + public function step() { + if ( ! $this->processor->next_token() ) { + return false; + } + + if ( WP_HTML_Tag_Processor::COMMENT_AS_HTML_COMMENT === $this->processor->get_comment_type() ) { + $comment_text = $this->processor->get_modifiable_text(); + echo "\e[90mFound a \e[32mcomment\e[90m: \e[34m{$comment_text}\e[m\n"; + $block = self::parse_block_comment_text( $comment_text ); + if ( false !== $block ) { + echo " \e[90mand it was a block!\e[m\n"; + $json = isset( $block[3] ) ? 
$block[3]->source() : '(no attributes)'; + echo " \e[90m \e[31m{$block[0]} for \e[33m{$block[1]}\e[90m/\e[33m{$block[2]}\e[90m: \e[3;35m{$json}\e[m\n"; + return $block; + } + } + } + + public static function parse_block_comment_text( $text ) { + $at = 0; + $length = strlen( $text ); + + /* + * The minimum block comment is not that short. + * + * Example: + * + * + * └────┘ 6 characters. + */ + if ( $length < 6 ) { + return false; + } + + /* + * Skip whitespace. + * + * Example: + * + * + * └──┘ + */ + $at += strspn( $text, " \t\r\n\f", $at ); + if ( $at >= $length ) { + return false; + } + + /* + * Is this a block closer? + * + * Example: + * + * + * ^ + */ + $is_closer = '/' === $text[ $at ]; + if ( $is_closer ) { + ++$at; + } + + /* + * Is this a void block? + * + * Example: + * + * + * ^ + * + * The self-closing flag takes precedence over + * the closing flag, so the following would be + * considered a void tag. + * + * Example: + * + * + */ + $is_void = '/' === $text[ $length - 1 ]; + + $delimiter_type = $is_void ? 'void' : ( $is_closer ? 'closer' : 'opener' ); + + /* + * Does this have the block comment start? + * + * Example: + * + * + * └─┘ + */ + if ( 0 !== substr_compare( $text, 'wp:', $at, 3 ) ) { + return false; + } + + /* + * Determine block name portion, which _must_ be followed by whitespace. + * + * Example: + * + * + * └───────┘ + */ + $name_length = strcspn( $text, " \t\r\n\f", $at ); + if ( 0 === $name_length ) { + return false; + } + + /* + * Determine if the block name contains a namespace or is + * implicitly the "core/" namespace because none is present. + * + * Example: + * + * + * ^ + */ + $slash_offset = strcspn( $text, '/', $at ); + if ( 0 === $slash_offset || $name_length === $slash_offset ) { + return false; + } + + $has_namespace = $slash_offset === $name_length; + + /* + * Separate the namespace from the block name, if a namespace is present. + * + * Example: + * + * + * └──┘ └───────┘ + */ + $namespace = $has_namespace + ? 
substr( $text, $at, $slash_offset ) + : 'core'; + + $block_name = $has_namespace + ? substr( $text, $at + $slash_offset, $name_length - $slash_offset ) + : substr( $text, $at, $name_length ); + + $at += $name_length; + + /* + * Validate the namespace and block name. + */ + $name_pattern = '~[a-z][a-z0-9_-]*~'; + if ( + 1 !== preg_match( $name_pattern, $namespace ) || + 1 !== preg_match( $name_pattern, $block_name ) + ) { + return false; + } + + /* + * Skip whitespace, which _must_ follow regardless of whether + * there are JSON block attributes. + * + * Example: + * + * + * └─┘ + * + * + * └─┘ + */ + $at += strspn( $text, " \t\r\n\f", $at ); + + // If this ends the comment, then there are no attributes. + if ( $at >= $length ) { + return array( $delimiter_type, $namespace, $block_name, null ); + } + + /* + * Find the JSON attributes; these are the only things allowed + * after this point other than the void block indicator. + * + * Example: + * + * + * ^ + */ + if ( '{' !== $text[ $at ] ) { + return false; + } + + /* + * Ensure there's whitespace after the potential JSON attributes. + * This could appear at the end, or if it's a void tag, immediately before it. + */ + if ( ! str_contains( " \t\r\n\f", $text[ $length - ( $is_void ? 2 : 1 ) ] ) ) { + return false; + } + + $json_region = substr( $text, $at, $length - $at - ( $is_void ? 1 : 0 ) ); + $json_region = trim( $json_region, " \t\r\n\f" ); + + if ( '}' !== $json_region[ strlen( $json_region ) - 1 ] ) { + return false; + } + + /* + * @todo Should the JSON be validated here? If it fails, should the delimiter + * be rejected or should it only return broken attributes? By avoiding + * the parse for now it can defer the parsing costs until they are read. + */ + $attributes = $is_closer ? 
null : new WP_Lazy_JSON_Object( $json_region ); + return array( $delimiter_type, $namespace, $block_name, $attributes ); + } +} diff --git a/src/wp-settings.php b/src/wp-settings.php index 9673479bfab76..5cdab40e8e82a 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -258,6 +258,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-token.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; +require ABSPATH . WPINC . '/html-api/class-wp-unified-block-parser.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . '/class-wp-http-curl.php'; From bf5e15cd2807c14391c28449bd106587e5c7e175 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 11 Apr 2024 16:45:06 +0200 Subject: [PATCH 2/4] Nest blocks and track stack of open blocks. --- .../class-wp-unified-block-parser.php | 245 ++++++++++++++++-- 1 file changed, 225 insertions(+), 20 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-unified-block-parser.php b/src/wp-includes/html-api/class-wp-unified-block-parser.php index 595f088c985f6..3de1fad94cf85 100644 --- a/src/wp-includes/html-api/class-wp-unified-block-parser.php +++ b/src/wp-includes/html-api/class-wp-unified-block-parser.php @@ -54,35 +54,240 @@ public function source() { } } +class WP_Lazy_Parsed_Block { + /** + * The block's namespace. + * + * @var string + */ + public $namespace; + + /** + * The block's name. + * + * @var string + */ + public $block_name; + + /** + * Whether the block is void. + * + * @var bool + */ + public $is_void; + + /** + * The block's JSON attributes, if available, lazily evaluated. + * + * @var ?WP_Lazy_JSON_Object + */ + public $attributes; + + /** + * The block's inner blocks. + * + * @var WP_Lazy_Parsed_Block[]. + */ + public $inner_blocks = array(); + + /** + * Bookmark for block opener. 
+ * + * @var string + */ + private $block_opener_at; + + /** + * Bookmark for block closer. + * + * @var string + */ + private $block_closer_at; + + public function __construct( $namespace, $block_name, $attributes, $opener_at, $is_void ) { + $this->namespace = $namespace; + $this->block_name = $block_name; + $this->attributes = $attributes; + $this->block_opener_at = $opener_at; + $this->is_void = $is_void; + if ( $is_void ) { + $this->block_closer_at = $opener_at; + } + } + + public function end_at( $bookmark_name ) { + $this->block_closer_at = $bookmark_name; + } +} + +class WP_Parsed_Block_Comment { + /** + * The block's namespace. + * + * @var string + */ + public $namespace; + + /** + * The block's name. + * + * @var string + */ + public $block_name; + + /** + * The block's attributes, if an opener. + * + * @var ?WP_Lazy_JSON_Object + */ + public $attributes; + + /** + * What kind of comment delimiter this is. + * + * @var string One of "opener" "closer" or "void". + */ + public $type; + + public function __construct( $type, $namespace, $block_name, $attributes ) { + $this->type = $type; + $this->namespace = $namespace; + $this->block_name = $block_name; + if ( self::CLOSER !== $type ) { + $this->attributes = $attributes; + } + } + + const OPENER = 'opener'; + + const CLOSER = 'closer'; + + const VOID = 'void'; +} + class WP_Unified_Block_Parser { /** @var WP_HTML_Processor */ private $processor; - /** @var array[] */ + /** @var WP_Lazy_Parsed_Block[] */ private $blocks = array(); + private $block_count = 0; + public function __construct( $html ) { $this->processor = WP_HTML_Processor::create_fragment( $html ); } + /** + * @param WP_Lazy_Parsed_Block $block + */ + private function open_block( $block ) { + if ( $block->is_void ) { + echo "\e[90mFound a \e[32mvoid\e[90m block of type \e[34m{$block->namespace}\e[90m/\e[34m{$block->block_name}\e[90m with "; + if ( isset( $block->attributes ) ) { + echo "\e[3;35m{$block->attributes->source()}\e[m\n"; + } else { + 
echo "\e[3mno attributes\e[m\n"; + } + + return; + } + + echo "\e[90mOpening a block of type \e[34m{$block->namespace}\e[90m/\e[34m{$block->block_name}\e[90m with "; + if ( isset( $block->attributes ) ) { + echo "\e[3;35m{$block->attributes->source()}\e[m\n"; + } else { + echo "\e[3mno attributes\e[m\n"; + } + } + + /** + * @param WP_Lazy_Parsed_Block $block + */ + private function close_block( $block ) { + echo "\e[90mClosing block of type \e[34m{$block->namespace}\e[90m/\e[34m{$block->block_name}\e[m\n"; + if ( count( $block->inner_blocks ) > 0 ) { + echo " \e[90mit contained inner blocks:\n"; + foreach ( $block->inner_blocks as $inner_block ) { + echo "\e[90m - \e[34m{$inner_block->namespace}\e[90m/\e[34m{$inner_block->block_name}"; + if ( isset( $inner_block->inner_blocks ) && count( $inner_block->inner_blocks ) > 0 ) { + echo "\e[90m which itself contained \e[33m" . count( $inner_block->inner_blocks ) . "\e[90m inner blocks"; + } + echo "\e[m\n"; + } + } + } + public function step() { if ( ! $this->processor->next_token() ) { return false; } if ( WP_HTML_Tag_Processor::COMMENT_AS_HTML_COMMENT === $this->processor->get_comment_type() ) { - $comment_text = $this->processor->get_modifiable_text(); - echo "\e[90mFound a \e[32mcomment\e[90m: \e[34m{$comment_text}\e[m\n"; - $block = self::parse_block_comment_text( $comment_text ); - if ( false !== $block ) { - echo " \e[90mand it was a block!\e[m\n"; - $json = isset( $block[3] ) ? 
$block[3]->source() : '(no attributes)'; - echo " \e[90m \e[31m{$block[0]} for \e[33m{$block[1]}\e[90m/\e[33m{$block[2]}\e[90m: \e[3;35m{$json}\e[m\n"; - return $block; + $comment_text = $this->processor->get_modifiable_text(); + $block_comment = self::parse_block_comment_text( $comment_text ); + + if ( isset( $block_comment ) ) { + switch ( $block_comment->type ) { + case WP_Parsed_Block_Comment::OPENER: + case WP_Parsed_Block_Comment::VOID: + ++$this->block_count; + $bookmark = "block-{$this->block_count}"; + $this->processor->set_bookmark( $bookmark ); + + $block = new WP_Lazy_Parsed_Block( + $block_comment->namespace, + $block_comment->block_name, + $block_comment->attributes, + $bookmark, + WP_Parsed_Block_Comment::VOID === $block_comment->type + ); + + $open_block = end( $this->blocks ); + if ( false !== $open_block ) { + $open_block->inner_blocks[] = $block; + } + + $this->blocks[] = $block; + $this->open_block( $block ); + break; + + case WP_Parsed_Block_Comment::CLOSER: + // Ignore closers if there are no openers. + if ( 0 === count( $this->blocks ) ) { + break; + } + + // Ignore also if it's not the associated closer for the most-recently opened block. + $opener = end( $this->blocks ); + if ( $opener->namespace !== $block_comment->namespace || $opener->block_name !== $block_comment->block_name ) { + break; + } + + ++$this->block_count; + $bookmark = "block-{$this->block_count}"; + $this->processor->set_bookmark( $bookmark ); + $opener->end_at( $bookmark ); + + array_pop( $this->blocks ); + $this->close_block( $opener ); + + break; + } + return $block_comment; } } } + /** + * Parses a comment's modifiable text to determine if it represents + * a valid block comment delimiter, and if so, returns the block meta. + * + * @since {WP_VERSION} + * + * @param string $text Modifiable text for an HTML comment to parse. + * @return WP_Parsed_Block_Comment|null Parsed block comment delimiter, if possible, otherwise null. 
+ */ public static function parse_block_comment_text( $text ) { $at = 0; $length = strlen( $text ); @@ -96,7 +301,7 @@ public static function parse_block_comment_text( $text ) { * └────┘ 6 characters. */ if ( $length < 6 ) { - return false; + return null; } /* @@ -109,7 +314,7 @@ public static function parse_block_comment_text( $text ) { */ $at += strspn( $text, " \t\r\n\f", $at ); if ( $at >= $length ) { - return false; + return null; } /* @@ -154,7 +359,7 @@ public static function parse_block_comment_text( $text ) { * └─┘ */ if ( 0 !== substr_compare( $text, 'wp:', $at, 3 ) ) { - return false; + return null; } /* @@ -167,7 +372,7 @@ public static function parse_block_comment_text( $text ) { */ $name_length = strcspn( $text, " \t\r\n\f", $at ); if ( 0 === $name_length ) { - return false; + return null; } /* @@ -181,7 +386,7 @@ public static function parse_block_comment_text( $text ) { */ $slash_offset = strcspn( $text, '/', $at ); if ( 0 === $slash_offset || $name_length === $slash_offset ) { - return false; + return null; } $has_namespace = $slash_offset === $name_length; @@ -212,7 +417,7 @@ public static function parse_block_comment_text( $text ) { 1 !== preg_match( $name_pattern, $namespace ) || 1 !== preg_match( $name_pattern, $block_name ) ) { - return false; + return null; } /* @@ -231,7 +436,7 @@ public static function parse_block_comment_text( $text ) { // If this ends the comment, then there are no attributes. if ( $at >= $length ) { - return array( $delimiter_type, $namespace, $block_name, null ); + return new WP_Parsed_Block_Comment( $delimiter_type, $namespace, $block_name, null ); } /* @@ -244,7 +449,7 @@ public static function parse_block_comment_text( $text ) { * ^ */ if ( '{' !== $text[ $at ] ) { - return false; + return null; } /* @@ -252,14 +457,14 @@ public static function parse_block_comment_text( $text ) { * This could appear at the end, or if it's a void tag, immediately before it. */ if ( ! 
str_contains( " \t\r\n\f", $text[ $length - ( $is_void ? 2 : 1 ) ] ) ) { - return false; + return null; } $json_region = substr( $text, $at, $length - $at - ( $is_void ? 1 : 0 ) ); $json_region = trim( $json_region, " \t\r\n\f" ); if ( '}' !== $json_region[ strlen( $json_region ) - 1 ] ) { - return false; + return null; } /* @@ -268,6 +473,6 @@ public static function parse_block_comment_text( $text ) { * the parse for now it can defer the parsing costs until they are read. */ $attributes = $is_closer ? null : new WP_Lazy_JSON_Object( $json_region ); - return array( $delimiter_type, $namespace, $block_name, $attributes ); + return new WP_Parsed_Block_Comment( $delimiter_type, $namespace, $block_name, $attributes ); } } From ce15db1c218b88b92af6667cb39278a1367dd4a9 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 14 Apr 2024 12:30:42 +0200 Subject: [PATCH 3/4] Add get_depth() and example code for block comment parser --- .../html-api/class-wp-unified-block-parser.php | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-unified-block-parser.php b/src/wp-includes/html-api/class-wp-unified-block-parser.php index 3de1fad94cf85..49af2301329d7 100644 --- a/src/wp-includes/html-api/class-wp-unified-block-parser.php +++ b/src/wp-includes/html-api/class-wp-unified-block-parser.php @@ -218,6 +218,10 @@ private function close_block( $block ) { } } + public function get_depth() { + return count( $this->blocks ); + } + public function step() { if ( ! $this->processor->next_token() ) { return false; @@ -283,6 +287,14 @@ public function step() { * Parses a comment's modifiable text to determine if it represents * a valid block comment delimiter, and if so, returns the block meta. 
* + * Example: + * + * $block = parse_block_comment_text( ' wp:paragraph {"dropCaps":true} ' ); + * $block === WP_Lazy_Parsed_Block( 'core', 'paragraph', WP_Lazy_JSON_Object( '{"dropCaps":true}' ) ); + * + * $block = parse_block_comment_text( '[IF[IE>6]]' ); + * $block === null; + * * @since {WP_VERSION} * * @param string $text Modifiable text for an HTML comment to parse. @@ -472,7 +484,7 @@ public static function parse_block_comment_text( $text ) { * be rejected or should it only return broken attributes? By avoiding * the parse for now it can defer the parsing costs until they are read. */ - $attributes = $is_closer ? null : new WP_Lazy_JSON_Object( $json_region ); + $attributes = $is_closer ? null : new WP_Lazy_JSON_Object( $text, $at, $length - $at - ( $is_void ? 1 : 0 ) ); return new WP_Parsed_Block_Comment( $delimiter_type, $namespace, $block_name, $attributes ); } } From b5ecc9bf3c05c415ddfb5b17e6644228d5187f73 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 14 Apr 2024 12:40:22 +0200 Subject: [PATCH 4/4] Add comments --- .../class-wp-unified-block-parser.php | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-unified-block-parser.php b/src/wp-includes/html-api/class-wp-unified-block-parser.php index 49af2301329d7..0f3915745c3c2 100644 --- a/src/wp-includes/html-api/class-wp-unified-block-parser.php +++ b/src/wp-includes/html-api/class-wp-unified-block-parser.php @@ -1,5 +1,11 @@ ', $next_at + 4 ); + if ( false === $next_at ) { + return false; + } + + $block = self::parse_block_comment_text( $html, $next_at + 4, $closer_at - $next_at - 4 ); + } + /** * Parses a comment's modifiable text to determine if it represents * a valid block comment delimiter, and if so, returns the block meta.