From 84dd3ac7d2d5e9d9fc87442d5c58d868428ad762 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Wed, 3 Dec 2025 00:37:54 +1100 Subject: [PATCH 1/5] refactor regex to prevent ReDos --- .../transform/preprocess-jsx-expressions.ts | 116 +++++++++++++----- 1 file changed, 86 insertions(+), 30 deletions(-) diff --git a/processor/transform/preprocess-jsx-expressions.ts b/processor/transform/preprocess-jsx-expressions.ts index d7a67b835..18564c826 100644 --- a/processor/transform/preprocess-jsx-expressions.ts +++ b/processor/transform/preprocess-jsx-expressions.ts @@ -46,8 +46,9 @@ function evaluateExpression(expression: string, context: JSXContext): unknown { // Base64 encode HTMLBlock content to prevent parser from consuming `}'; + * protectHTMLBlockContent(input) + * // Returns: '' + * ``` + * @example + * ```typescript + * const input = '{`console.log("hello");`}'; + * protectHTMLBlockContent(input) + * // Returns: '' + * ``` + */ function protectHTMLBlockContent(content: string): string { - // each char matches exactly one way, preventing backtracking return content.replace( /(]*>)\{\s*`((?:[^`\\]|\\.)*)`\s*\}(<\/HTMLBlock>)/g, (_match, openTag: string, templateContent: string, closeTag: string) => { @@ -56,7 +101,45 @@ function protectHTMLBlockContent(content: string): string { ); } -// Protect code blocks and inline code from processing +/** + * Protects code blocks and inline code from JSX processing. + * + * Replaces fenced code blocks (```code block```) and inline code (`inline code`) with placeholders + * so they aren't affected by expression evaluation or other JSX processing steps. + * The original code is stored in arrays for later restoration. + * + * Process: + * 1. Find all fenced code blocks (```code block```) and replace with placeholders + * 2. Find all inline code (`inline code`) and replace with placeholders + * 3. Store originals in arrays for later restoration + * + * @param content - The markdown content to protect + * @returns Object containing protected content and arrays of original code blocks + * @example + * ```typescript + * const input = 'Text with `inline code` and ```fenced block```'; + * protectCodeBlocks(input) + * // Returns: { + * // protectedCode: { + * // codeBlocks: ['```fenced block```'], + * // inlineCode: ['`inline code`'] + * // }, + * // protectedContent: 'Text with ___INLINE_CODE_0___ and ___CODE_BLOCK_0___' + * // } + * ``` + * @example + * ```typescript + * const input = '```js\nconst x = {value: 1};\n```'; + * protectCodeBlocks(input) + * // Returns: { + * // protectedCode: { + * // codeBlocks: ['```js\nconst x = {value: 1};\n```'], + * // inlineCode: [] + * // }, + * // protectedContent: '___CODE_BLOCK_0___' + * // } + * ``` + */ function protectCodeBlocks(content: string): ProtectCodeBlocksResult { const codeBlocks: string[] = []; const inlineCode: string[] = []; @@ -69,10 +152,8 @@ function protectCodeBlocks(content: string): ProtectCodeBlocksResult { protectedContent += remaining.slice(0, codeBlockStart); remaining = remaining.slice(codeBlockStart); - // Find the closing ``` const codeBlockEnd = remaining.indexOf('```', 3); if (codeBlockEnd === -1) { - // No closing ```, keep the rest as-is break; } @@ -95,12 +176,62 @@ function protectCodeBlocks(content: string): ProtectCodeBlocksResult { return { protectedCode: { codeBlocks, inlineCode }, protectedContent }; } +/** + * Removes JSX-style comments from content. + * + * JSX comments are wrapped in braces with C-style comment syntax. + * Format: opening brace, optional whitespace, slash-asterisk, comment content, asterisk-slash, optional whitespace, closing brace. + * These comments would confuse the markdown parser, so they're removed before processing. + * + * The regex matches: + * - Opening brace with optional whitespace + * - Comment start marker (slash-asterisk) + * - Comment content (handling asterisks that don't close the comment) + * - Comment end marker (asterisk-slash) + * - Optional whitespace and closing brace + * + * @param content - Content potentially containing JSX comments + * @returns Content with JSX comments removed + * @example + * Input: 'Text { /* comment *\/ } more text' + * Output: 'Text more text' + * @example + * Input: '{ /* comment *\/ }' + * Output: '' + */ function removeJSXComments(content: string): string { - // This matches: any non-* chars, then (* followed by non-/ followed by non-* chars) repeated return content.replace(/\{\s*\/\*[^*]*(?:\*(?!\/)[^*]*)*\*\/\s*\}/g, ''); } -// Returns content between balanced braces and end position, or null if unbalanced +/** + * Extracts content between balanced braces starting at a given position. + * + * Tracks brace depth to handle nested braces correctly. Starts at depth 1 since + * the opening brace is already consumed. Returns the content between braces + * (excluding the braces themselves) and the position after the closing brace. + * + * @param content - The string to search in + * @param start - Starting position (should be after the opening {) + * @returns Object with extracted content and end position, or null if braces are unbalanced + * @example + * ```typescript + * const input = 'foo{bar{baz}qux}end'; + * extractBalancedBraces(input, 3) // start at position 3 (after '{') + * // Returns: { content: 'bar{baz}qux', end: 16 } + * ``` + * @example + * ```typescript + * const input = 'attr={value}'; + * extractBalancedBraces(input, 6) // start at position 6 (after '{') + * // Returns: { content: 'value', end: 12 } + * ``` + * @example + * ```typescript + * const input = 'unbalanced{'; + * extractBalancedBraces(input, 10) + * // Returns: null (unbalanced braces) + * ``` + */ function extractBalancedBraces(content: string, start: number): { content: string; end: number } | null { let depth = 1; let pos = start; @@ -116,9 +247,59 @@ function extractBalancedBraces(content: string, start: number): { content: strin return { content: content.slice(start, pos - 1), end: pos }; } -// Evaluate attribute expressions: attribute={expression} → attribute="value" +/** + * Evaluates JSX attribute expressions and converts them to HTML attributes. + * + * Transforms JSX attribute syntax (attribute={expression}) to HTML attributes (attribute="value"). + * The expression is evaluated using the provided context, and the result is converted to + * a string value for the HTML attribute. + * + * Special handling: + * - `style` objects are converted to CSS strings (camelCase → kebab-case) + * - `className` is converted to `class` (HTML standard) + * - Objects are JSON stringified + * - If evaluation fails, the original expression is kept unchanged + * + * @param content - Content containing JSX attribute expressions + * @param context - Context object for expression evaluation + * @returns Content with attribute expressions evaluated and converted to HTML attributes + * @example + * ```typescript + * const context = { baseUrl: 'https://example.com' }; + * const input = 'Link'; + * evaluateAttributeExpressions(input, context) + * // Returns: 'Link' + * ``` + * @example + * ```typescript + * const context = { isActive: true }; + * const input = '
Content
'; + * evaluateAttributeExpressions(input, context) + * // Returns: '
Content
' + * ``` + * @example + * ```typescript + * const context = { styles: { backgroundColor: 'red', fontSize: '14px' } }; + * const input = '
Content
'; + * evaluateAttributeExpressions(input, context) + * // Returns: '
Content
' + * ``` + * @example + * ```typescript + * const context = { className: 'my-class' }; + * const input = '
Content
'; + * evaluateAttributeExpressions(input, context) + * // Returns: '
Content
' + * ``` + * @example + * ```typescript + * const context = { data: { id: 1, name: 'test' } }; + * const input = '
Content
'; + * evaluateAttributeExpressions(input, context) + * // Returns: '
Content
' + * ``` + */ function evaluateAttributeExpressions(content: string, context: JSXContext): string { - // Match attribute names followed by ={ const attrStartRegex = /(\w+)=\{/g; let result = ''; let lastEnd = 0; @@ -168,6 +349,26 @@ function evaluateAttributeExpressions(content: string, context: JSXContext): str return result; } +/** + * Restores code blocks and inline code that were protected earlier. + * + * Replaces placeholders (___CODE_BLOCK_N___ and ___INLINE_CODE_N___) with the + * original code content that was stored during the protection phase. + * + * @param content - Content with code block placeholders + * @param protectedCode - Object containing arrays of original code blocks and inline code + * @returns Content with all code blocks and inline code restored + * @example + * ```typescript + * const content = 'Text with ___INLINE_CODE_0___ and ___CODE_BLOCK_0___'; + * const protectedCode = { + * codeBlocks: ['```js\ncode\n```'], + * inlineCode: ['`inline`'] + * }; + * restoreCodeBlocks(content, protectedCode) + * // Returns: 'Text with `inline` and ```js\ncode\n```' + * ``` + */ function restoreCodeBlocks(content: string, protectedCode: ProtectedCode): string { let restored = content.replace(/___CODE_BLOCK_(\d+)___/g, (_match, index: string) => { return protectedCode.codeBlocks[parseInt(index, 10)]; @@ -180,8 +381,57 @@ function restoreCodeBlocks(content: string, protectedCode: ProtectedCode): strin return restored; } -// We cant rely on remarkMdx since it restricts the syntax a lot -// so we have to try as much as possible to parse JSX syntax manually +/** + * Main preprocessing function for JSX-like expressions in markdown. + * + * We can't rely on remarkMdx since it restricts the syntax too much, so we manually + * parse and process JSX syntax before the markdown parser runs. + * + * Processing pipeline (executed in order): + * 1. Protect HTMLBlock content (base64 encode to prevent parser from consuming `}'; * protectHTMLBlockContent(input) * // Returns: '' * ``` - * @example - * ```typescript - * const input = '{`console.log("hello");`}'; - * protectHTMLBlockContent(input) - * // Returns: '' - * ``` */ function protectHTMLBlockContent(content: string): string { return content.replace( @@ -102,18 +79,9 @@ function protectHTMLBlockContent(content: string): string { } /** - * Protects code blocks and inline code from JSX processing. - * - * Replaces fenced code blocks (```code block```) and inline code (`inline code`) with placeholders - * so they aren't affected by expression evaluation or other JSX processing steps. - * The original code is stored in arrays for later restoration. - * - * Process: - * 1. Find all fenced code blocks (```code block```) and replace with placeholders - * 2. Find all inline code (`inline code`) and replace with placeholders - * 3. Store originals in arrays for later restoration + * Replaces code blocks and inline code with placeholders to protect them from JSX processing. * - * @param content - The markdown content to protect + * @param content * @returns Object containing protected content and arrays of original code blocks * @example * ```typescript @@ -127,18 +95,6 @@ function protectHTMLBlockContent(content: string): string { * // protectedContent: 'Text with ___INLINE_CODE_0___ and ___CODE_BLOCK_0___' * // } * ``` - * @example - * ```typescript - * const input = '```js\nconst x = {value: 1};\n```'; - * protectCodeBlocks(input) - * // Returns: { - * // protectedCode: { - * // codeBlocks: ['```js\nconst x = {value: 1};\n```'], - * // inlineCode: [] - * // }, - * // protectedContent: '___CODE_BLOCK_0___' - * // } - * ``` */ function protectCodeBlocks(content: string): ProtectCodeBlocksResult { const codeBlocks: string[] = []; @@ -177,41 +133,25 @@ function protectCodeBlocks(content: string): ProtectCodeBlocksResult { } /** - * Removes JSX-style comments from content. + * Removes JSX-style comments (e.g., { /* comment *\/ }) from content. * - * JSX comments are wrapped in braces with C-style comment syntax. - * Format: opening brace, optional whitespace, slash-asterisk, comment content, asterisk-slash, optional whitespace, closing brace. - * These comments would confuse the markdown parser, so they're removed before processing. - * - * The regex matches: - * - Opening brace with optional whitespace - * - Comment start marker (slash-asterisk) - * - Comment content (handling asterisks that don't close the comment) - * - Comment end marker (asterisk-slash) - * - Optional whitespace and closing brace - * - * @param content - Content potentially containing JSX comments + * @param content * @returns Content with JSX comments removed * @example - * Input: 'Text { /* comment *\/ } more text' - * Output: 'Text more text' - * @example - * Input: '{ /* comment *\/ }' - * Output: '' + * ```typescript + * removeJSXComments('Text { /* comment *\/ } more text') + * // Returns: 'Text more text' + * ``` */ function removeJSXComments(content: string): string { return content.replace(/\{\s*\/\*[^*]*(?:\*(?!\/)[^*]*)*\*\/\s*\}/g, ''); } /** - * Extracts content between balanced braces starting at a given position. - * - * Tracks brace depth to handle nested braces correctly. Starts at depth 1 since - * the opening brace is already consumed. Returns the content between braces - * (excluding the braces themselves) and the position after the closing brace. + * Extracts content between balanced braces, handling nested braces. * - * @param content - The string to search in - * @param start - Starting position (should be after the opening {) + * @param content + * @param start * @returns Object with extracted content and end position, or null if braces are unbalanced * @example * ```typescript @@ -219,18 +159,6 @@ function removeJSXComments(content: string): string { * extractBalancedBraces(input, 3) // start at position 3 (after '{') * // Returns: { content: 'bar{baz}qux', end: 16 } * ``` - * @example - * ```typescript - * const input = 'attr={value}'; - * extractBalancedBraces(input, 6) // start at position 6 (after '{') - * // Returns: { content: 'value', end: 12 } - * ``` - * @example - * ```typescript - * const input = 'unbalanced{'; - * extractBalancedBraces(input, 10) - * // Returns: null (unbalanced braces) - * ``` */ function extractBalancedBraces(content: string, start: number): { content: string; end: number } | null { let depth = 1; @@ -248,20 +176,11 @@ function extractBalancedBraces(content: string, start: number): { content: strin } /** - * Evaluates JSX attribute expressions and converts them to HTML attributes. - * - * Transforms JSX attribute syntax (attribute={expression}) to HTML attributes (attribute="value"). - * The expression is evaluated using the provided context, and the result is converted to - * a string value for the HTML attribute. + * Converts JSX attribute expressions (attribute={expression}) to HTML attributes (attribute="value"). + * Handles style objects (camelCase → kebab-case), className → class, and JSON stringifies objects. * - * Special handling: - * - `style` objects are converted to CSS strings (camelCase → kebab-case) - * - `className` is converted to `class` (HTML standard) - * - Objects are JSON stringified - * - If evaluation fails, the original expression is kept unchanged - * - * @param content - Content containing JSX attribute expressions - * @param context - Context object for expression evaluation + * @param content + * @param context * @returns Content with attribute expressions evaluated and converted to HTML attributes * @example * ```typescript @@ -270,34 +189,6 @@ function extractBalancedBraces(content: string, start: number): { content: strin * evaluateAttributeExpressions(input, context) * // Returns: 'Link' * ``` - * @example - * ```typescript - * const context = { isActive: true }; - * const input = '
Content
'; - * evaluateAttributeExpressions(input, context) - * // Returns: '
Content
' - * ``` - * @example - * ```typescript - * const context = { styles: { backgroundColor: 'red', fontSize: '14px' } }; - * const input = '
Content
'; - * evaluateAttributeExpressions(input, context) - * // Returns: '
Content
' - * ``` - * @example - * ```typescript - * const context = { className: 'my-class' }; - * const input = '
Content
'; - * evaluateAttributeExpressions(input, context) - * // Returns: '
Content
' - * ``` - * @example - * ```typescript - * const context = { data: { id: 1, name: 'test' } }; - * const input = '
Content
'; - * evaluateAttributeExpressions(input, context) - * // Returns: '
Content
' - * ``` */ function evaluateAttributeExpressions(content: string, context: JSXContext): string { const attrStartRegex = /(\w+)=\{/g; @@ -350,13 +241,10 @@ function evaluateAttributeExpressions(content: string, context: JSXContext): str } /** - * Restores code blocks and inline code that were protected earlier. - * - * Replaces placeholders (___CODE_BLOCK_N___ and ___INLINE_CODE_N___) with the - * original code content that was stored during the protection phase. + * Restores code blocks and inline code by replacing placeholders with original content. * - * @param content - Content with code block placeholders - * @param protectedCode - Object containing arrays of original code blocks and inline code + * @param content + * @param protectedCode * @returns Content with all code blocks and inline code restored * @example * ```typescript @@ -382,55 +270,12 @@ function restoreCodeBlocks(content: string, protectedCode: ProtectedCode): strin } /** - * Main preprocessing function for JSX-like expressions in markdown. + * Preprocesses JSX-like expressions in markdown before parsing. + * Inline expressions are handled separately; attribute expressions are processed here. * - * We can't rely on remarkMdx since it restricts the syntax too much, so we manually - * parse and process JSX syntax before the markdown parser runs. - * - * Processing pipeline (executed in order): - * 1. Protect HTMLBlock content (base64 encode to prevent parser from consuming