From 6c3bec2c968d7ed1d7b5c61ddb9a5decd72b5582 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 14:33:10 +1000 Subject: [PATCH 01/22] Update spec to CommonMark 0.29, sync regression tests --- .../src/main/resources/cmark-regression.txt | 67 ++- .../resources/commonmark.js-regression.txt | 53 +- .../src/main/resources/spec.txt | 548 ++++++++++++++---- 3 files changed, 535 insertions(+), 133 deletions(-) diff --git a/commonmark-test-util/src/main/resources/cmark-regression.txt b/commonmark-test-util/src/main/resources/cmark-regression.txt index 2984a3bef..62b1e7efe 100644 --- a/commonmark-test-util/src/main/resources/cmark-regression.txt +++ b/commonmark-test-util/src/main/resources/cmark-regression.txt @@ -4,8 +4,7 @@ Issue #113: EOL character weirdness on Windows (Important: first line ends with CR + CR + LF) ```````````````````````````````` example -line1 - +line1 line2 .

line1

@@ -82,7 +81,8 @@ Issue #193 - unescaped left angle brackets in link destination [a]: . -

a

+

[a]

+

[a]: <te

```````````````````````````````` Issue #192 - escaped spaces in link destination @@ -93,3 +93,64 @@ Issue #192 - escaped spaces in link destination .

[a](te\ st)

```````````````````````````````` + +Issue #527 - meta tags in inline contexts + +```````````````````````````````` example +City: + + + +. +

City: + + +

+```````````````````````````````` + +Issue #530 - link parsing corner cases + +```````````````````````````````` example +[a](\ b) + +[a](<[a](\ b)

+

[a](<<b)

+

[a](<b +)

+```````````````````````````````` + +Issue commonmark#526 - unescaped ( in link title + +```````````````````````````````` example +[link](url ((title)) +. +

[link](url ((title))

+```````````````````````````````` + +Issue commonmark#517 - script, pre, style close tag without +opener. + +```````````````````````````````` example + + + + + +. + + + +```````````````````````````````` + +Issue #289. + +```````````````````````````````` example +[a]( +. +

[a](<b) c>

+```````````````````````````````` diff --git a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt index 7300952fe..a99620bb1 100644 --- a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt +++ b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt @@ -15,10 +15,10 @@ bar Type 7 HTML block followed by whitespace (#98). ```````````````````````````````` example - + x . - + x ```````````````````````````````` @@ -95,10 +95,55 @@ Issue #116 - tabs before and after ATX closing heading

foo

```````````````````````````````` -commonmark/CommonMark#493 - escaped space not allowed in link -destination. +commonmark/CommonMark#493 - escaped space not allowed in link destination. + ```````````````````````````````` example [link](a\ b) .

[link](a\ b)

```````````````````````````````` + +Issue #527 - meta tags in inline contexts + +```````````````````````````````` example +City: + + + +. +

City: + + +

+```````````````````````````````` + +Double-encoding. + +```````````````````````````````` example +[XSS](javascript&colon;alert%28'XSS'%29) +. +

XSS

+```````````````````````````````` + +Issue commonmark#517 - script, pre, style close tag without +opener. + +```````````````````````````````` example + + + + + +. + + + +```````````````````````````````` + +Issue #289. + +```````````````````````````````` example +[a]( +. +

[a](<b) c>

+```````````````````````````````` diff --git a/commonmark-test-util/src/main/resources/spec.txt b/commonmark-test-util/src/main/resources/spec.txt index 9fd584139..3913de442 100644 --- a/commonmark-test-util/src/main/resources/spec.txt +++ b/commonmark-test-util/src/main/resources/spec.txt @@ -1,8 +1,8 @@ --- title: CommonMark Spec author: John MacFarlane -version: 0.28 -date: '2017-08-01' +version: 0.29 +date: '2019-04-06' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -248,7 +248,7 @@ satisfactory replacement for a spec. Because there is no unambiguous spec, implementations have diverged considerably. As a result, users are often surprised to find that -a document that renders one way on one system (say, a github wiki) +a document that renders one way on one system (say, a GitHub wiki) renders differently on another (say, converting to docbook using pandoc). To make matters worse, because nothing in Markdown counts as a "syntax error," the divergence often isn't discovered right away. @@ -328,8 +328,10 @@ that is not a [whitespace character]. An [ASCII punctuation character](@) is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, -`*`, `+`, `,`, `-`, `.`, `/`, `:`, `;`, `<`, `=`, `>`, `?`, `@`, -`[`, `\`, `]`, `^`, `_`, `` ` ``, `{`, `|`, `}`, or `~`. +`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), +`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040), +`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), +`{`, `|`, `}`, or `~` (U+007B–007E). A [punctuation character](@) is an [ASCII punctuation character] or anything in @@ -514,8 +516,8 @@ one block element does not affect the inline parsing of any other. ## Container blocks and leaf blocks We can divide blocks into two types: -[container block](@)s, -which can contain other blocks, and [leaf block](@)s, +[container blocks](@), +which can contain other blocks, and [leaf blocks](@), which cannot. # Leaf blocks @@ -527,7 +529,7 @@ Markdown document. 
A line consisting of 0-3 spaces of indentation, followed by a sequence of three or more matching `-`, `_`, or `*` characters, each followed -optionally by any number of spaces, forms a +optionally by any number of spaces or tabs, forms a [thematic break](@). ```````````````````````````````` example @@ -825,7 +827,7 @@ Contents are parsed as inlines: ```````````````````````````````` -Leading and trailing blanks are ignored in parsing inline content: +Leading and trailing [whitespace] is ignored in parsing inline content: ```````````````````````````````` example # foo @@ -1024,6 +1026,20 @@ baz* baz ```````````````````````````````` +The contents are the result of parsing the headings's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +[whitespace]. + +```````````````````````````````` example + Foo *bar +baz*→ +==== +. +

Foo bar +baz

+```````````````````````````````` + The underlining can be any length: @@ -1584,8 +1600,8 @@ begins with a code fence, indented no more than three spaces. The line with the opening code fence may optionally contain some text following the code fence; this is trimmed of leading and trailing -spaces and called the [info string](@). -The [info string] may not contain any backtick +whitespace and called the [info string](@). If the [info string] comes +after a backtick fence, it may not contain any backtick characters. (The reason for this restriction is that otherwise some inline code would be incorrectly interpreted as the beginning of a fenced code block.) @@ -1870,7 +1886,7 @@ Code fences (opening and closing) cannot contain internal spaces: ``` ``` aaa . -

+

aaa

```````````````````````````````` @@ -1922,9 +1938,11 @@ bar An [info string] can be provided after the opening code fence. -Opening and closing spaces will be stripped, and the first word, prefixed -with `language-`, is used as the value for the `class` attribute of the -`code` element within the enclosing `pre` element. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. ```````````````````````````````` example ```ruby @@ -1973,6 +1991,18 @@ foo

```````````````````````````````` +[Info strings] for tilde code blocks can contain backticks and tildes: + +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ +. +
foo
+
+```````````````````````````````` + + Closing code fences cannot have [info strings]: ```````````````````````````````` example @@ -1991,14 +2021,15 @@ Closing code fences cannot have [info strings]: An [HTML block](@) is a group of lines that is treated as raw HTML (and will not be escaped in HTML output). -There are seven kinds of [HTML block], which can be defined -by their start and end conditions. The block begins with a line that -meets a [start condition](@) (after up to three spaces -optional indentation). It ends with the first subsequent line that -meets a matching [end condition](@), or the last line of -the document or other [container block]), if no line is encountered that meets the -[end condition]. If the first line meets both the [start condition] -and the [end condition], the block will contain just that line. +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three spaces optional indentation). +It ends with the first subsequent line that meets a matching [end +condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. 1. **Start condition:** line begins with the string ``, or @@ -2037,16 +2068,17 @@ the string `/>`.\ **End condition:** line is followed by a [blank line]. 7. **Start condition:** line begins with a complete [open tag] -or [closing tag] (with any [tag name] other than `script`, -`style`, or `pre`) followed only by [whitespace] -or the end of the line.\ +(with any [tag name] other than `script`, +`style`, or `pre`) or a complete [closing tag], +followed only by [whitespace] or the end of the line.\ **End condition:** line is followed by a [blank line]. 
HTML blocks continue until they are closed by their appropriate -[end condition], or the last line of the document or other [container block]. -This means any HTML **within an HTML block** that might otherwise be recognised -as a start condition will be ignored by the parser and passed through as-is, -without changing the parser's state. +[end condition], or the last line of the document or other [container +block](#container-blocks). This means any HTML **within an HTML +block** that might otherwise be recognised as a start condition will +be ignored by the parser and passed through as-is, without changing +the parser's state. For instance, `
` within a HTML block started by `` will not affect
 the parser state; as the HTML block was started by start condition 6, it
@@ -2069,7 +2101,7 @@ _world_.
 
```````````````````````````````` -In this case, the HTML block is terminated by the newline — the `**hello**` +In this case, the HTML block is terminated by the newline — the `**Hello**` text remains verbatim — and regular parsing resumes, with a paragraph, emphasised `world` and inline and block HTML following. @@ -2612,7 +2644,8 @@ bar However, a following blank line is needed, except at the end of -a document, and except for blocks of types 1--5, above: +a document, and except for blocks of types 1--5, [above][HTML +block]: ```````````````````````````````` example
@@ -2758,8 +2791,8 @@ an indented code block: Fortunately, blank lines are usually not necessary and can be deleted. The exception is inside `
` tags, but as described
-above, raw HTML blocks starting with `
` *can* contain blank
-lines.
+[above][HTML blocks], raw HTML blocks starting with `
`
+*can* contain blank lines.
 
 ## Link reference definitions
 
@@ -2811,7 +2844,7 @@ them.
 
 ```````````````````````````````` example
 [Foo bar]:
-
+
 'title'
 
 [Foo bar]
@@ -2877,6 +2910,29 @@ The link destination may not be omitted:
 

[foo]

```````````````````````````````` + However, an empty link destination may be specified using + angle brackets: + +```````````````````````````````` example +[foo]: <> + +[foo] +. +

foo

+```````````````````````````````` + +The title must be separated from the link destination by +whitespace: + +```````````````````````````````` example +[foo]: (baz) + +[foo] +. +

[foo]: (baz)

+

[foo]

+```````````````````````````````` + Both title and destination can contain backslash escapes and literal backslashes: @@ -3034,6 +3090,25 @@ and thematic breaks, and it need not be followed by a blank line. ```````````````````````````````` +```````````````````````````````` example +[foo]: /url +bar +=== +[foo] +. +

bar

+

foo

+```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +=== +[foo] +. +

=== +foo

+```````````````````````````````` + Several [link reference definitions] can occur one after another, without intervening blank lines. @@ -3070,6 +3145,17 @@ are defined: ```````````````````````````````` +Whether something is a [link reference definition] is +independent of whether the link reference it defines is +used in the document. Thus, for example, the following +document contains just a link reference definition, and +no visible content: + +```````````````````````````````` example +[foo]: /url +. +```````````````````````````````` + ## Paragraphs @@ -3207,7 +3293,7 @@ aaa # Container blocks -A [container block] is a block that has other +A [container block](#container-blocks) is a block that has other blocks as its contents. There are two basic kinds of container blocks: [block quotes] and [list items]. [Lists] are meta-containers for [list items]. @@ -3669,9 +3755,8 @@ in some browsers.) The following rules define [list items]: 1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of - blocks *Bs* starting with a [non-whitespace character] and not separated - from each other by more than one blank line, and *M* is a list - marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result + blocks *Bs* starting with a [non-whitespace character], and *M* is a + list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result of prepending *M* and the following spaces to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a list item with *Bs* as its contents. The type of the list item @@ -3981,8 +4066,7 @@ A start number may not be negative: 2. 
**Item starting with indented code.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with an indented code - block and not separated from each other by more than one blank line, - and *M* is a list marker of width *W* followed by + block, and *M* is a list marker of width *W* followed by one space, then the result of prepending *M* and the following space to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. @@ -4458,9 +4542,10 @@ continued here.

6. **That's all.** Nothing that is not counted as a list item by rules #1--5 counts as a [list item](#list-items). -The rules for sublists follow from the general rules above. A sublist -must be indented the same number of spaces a paragraph would need to be -in order to be included in the list item. +The rules for sublists follow from the general rules +[above][List items]. A sublist must be indented the same number +of spaces a paragraph would need to be in order to be included +in the list item. So, in this case we need two spaces indent: @@ -5049,11 +5134,9 @@ item: - b - c - d - - e - - f - - g - - h -- i + - e + - f +- g .
  • a
  • @@ -5063,8 +5146,6 @@ item:
  • e
  • f
  • g
  • -
  • h
  • -
  • i
```````````````````````````````` @@ -5074,7 +5155,7 @@ item: 2. b - 3. c + 3. c .
  1. @@ -5089,6 +5170,49 @@ item:
```````````````````````````````` +Note, however, that list items may not be indented more than +three spaces. Here `- e` is treated as a paragraph continuation +line, because it is indented more than three spaces: + +```````````````````````````````` example +- a + - b + - c + - d + - e +. +
    +
  • a
  • +
  • b
  • +
  • c
  • +
  • d +- e
  • +
+```````````````````````````````` + +And here, `3. c` is treated as an indented code block, +because it is indented four spaces and preceded by a +blank line. + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +
    +
  1. +

    a

    +
  2. +
  3. +

    b

    +
  4. +
+
3. c
+
+```````````````````````````````` + This is a loose list, because there is a blank line between two of the list items: @@ -5378,10 +5502,10 @@ Thus, for example, in

hilo`

```````````````````````````````` - `hi` is parsed as code, leaving the backtick at the end as a literal backtick. + ## Backslash escapes Any ASCII punctuation character may be backslash-escaped: @@ -5415,6 +5539,7 @@ not have their usual Markdown meanings: \* not a list \# not a heading \[foo]: /url "not a reference" +\ö not a character entity .

*not emphasized* <br/> not a tag @@ -5423,7 +5548,8 @@ not have their usual Markdown meanings: 1. not a list * not a list # not a heading -[foo]: /url "not a reference"

+[foo]: /url "not a reference" +&ouml; not a character entity

```````````````````````````````` @@ -5521,13 +5647,23 @@ foo ## Entity and numeric character references -All valid HTML entity references and numeric character -references, except those occuring in code blocks and code spans, -are recognized as such and treated as equivalent to the -corresponding Unicode characters. Conforming CommonMark parsers -need not store information about whether a particular character -was represented in the source using a Unicode character or -an entity reference. +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: + +- Entity and character references are not recognized in code + blocks and code spans. + +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `*` can be used + in place of a literal `*` character, `*` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. + +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. [Entity references](@) consist of `&` + any of the valid HTML5 entity names + `;`. The @@ -5548,22 +5684,22 @@ references and their corresponding code points. [Decimal numeric character references](@) -consist of `&#` + a string of 1--8 arabic digits + `;`. A +consist of `&#` + a string of 1--7 arabic digits + `;`. A numeric character reference is parsed as the corresponding Unicode character. Invalid Unicode code points will be replaced by the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, the code point `U+0000` will also be replaced by `U+FFFD`. ```````````````````````````````` example -# Ӓ Ϡ � � +# Ӓ Ϡ � . -

# Ӓ Ϡ � �

+

# Ӓ Ϡ �

```````````````````````````````` [Hexadecimal numeric character references](@) consist of `&#` + -either `X` or `x` + a string of 1-8 hexadecimal digits + `;`. +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. They too are parsed as the corresponding Unicode character (this time specified with a hexadecimal numeral instead of decimal). @@ -5578,9 +5714,13 @@ Here are some nonentities: ```````````````````````````````` example   &x; &#; &#x; +� +&#abcdef0; &ThisIsNotDefined; &hi?; .

&nbsp &x; &#; &#x; +&#987654321; +&#abcdef0; &ThisIsNotDefined; &hi?;

```````````````````````````````` @@ -5661,6 +5801,51 @@ text in code spans and code blocks: ```````````````````````````````` +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. + +```````````````````````````````` example +*foo* +*foo* +. +

*foo* +foo

+```````````````````````````````` + +```````````````````````````````` example +* foo + +* foo +. +

* foo

+
    +
  • foo
  • +
+```````````````````````````````` + +```````````````````````````````` example +foo bar +. +

foo + +bar

+```````````````````````````````` + +```````````````````````````````` example + foo +. +

→foo

+```````````````````````````````` + + +```````````````````````````````` example +[a](url "tit") +. +

[a](url "tit")

+```````````````````````````````` + + ## Code spans A [backtick string](@) @@ -5669,9 +5854,16 @@ preceded nor followed by a backtick. A [code span](@) begins with a backtick string and ends with a backtick string of equal length. The contents of the code span are -the characters between the two backtick strings, with leading and -trailing spaces and [line endings] removed, and -[whitespace] collapsed to single spaces. +the characters between the two backtick strings, normalized in the +following ways: + +- First, [line endings] are converted to [spaces]. +- If the resulting string both begins *and* ends with a [space] + character, but does not consist entirely of [space] + characters, a single [space] character is removed from the + front and back. This allows you to include code that begins + or ends with backtick characters, which must be separated by + whitespace from the opening or closing backtick strings. This is a simple code span: @@ -5683,10 +5875,11 @@ This is a simple code span: Here two backticks are used, because the code contains a backtick. -This example also illustrates stripping of leading and trailing spaces: +This example also illustrates stripping of a single leading and +trailing space: ```````````````````````````````` example -`` foo ` bar `` +`` foo ` bar `` .

foo ` bar

```````````````````````````````` @@ -5701,58 +5894,79 @@ spaces:

``

```````````````````````````````` +Note that only *one* space is stripped: -[Line endings] are treated like spaces: +```````````````````````````````` example +` `` ` +. +

``

+```````````````````````````````` + +The stripping only happens if the space is on both +sides of the string: ```````````````````````````````` example -`` -foo -`` +` a` . -

foo

+

a

```````````````````````````````` +Only [spaces], and not [unicode whitespace] in general, are +stripped in this way: + +```````````````````````````````` example +` b ` +. +

 b 

+```````````````````````````````` -Interior spaces and [line endings] are collapsed into -single spaces, just as they would be by a browser: +No stripping occurs if the code span contains only spaces: ```````````````````````````````` example -`foo bar - baz` +` ` +` ` . -

foo bar baz

+

  +

```````````````````````````````` -Not all [Unicode whitespace] (for instance, non-breaking space) is -collapsed, however: +[Line endings] are treated like spaces: ```````````````````````````````` example -`a  b` +`` +foo +bar +baz +`` . -

a  b

+

foo bar baz

```````````````````````````````` +```````````````````````````````` example +`` +foo +`` +. +

foo

+```````````````````````````````` -Q: Why not just leave the spaces, since browsers will collapse them -anyway? A: Because we might be targeting a non-HTML format, and we -shouldn't rely on HTML-specific rendering assumptions. -(Existing implementations differ in their treatment of internal -spaces and [line endings]. Some, including `Markdown.pl` and -`showdown`, convert an internal [line ending] into a -`
` tag. But this makes things difficult for those who like to -hard-wrap their paragraphs, since a line break in the midst of a code -span will cause an unintended line break in the output. Others just -leave internal spaces as they are, which is fine if only HTML is being -targeted.) +Interior spaces are not collapsed: ```````````````````````````````` example -`foo `` bar` +`foo bar +baz` . -

foo `` bar

+

foo bar baz

```````````````````````````````` +Note that browsers will typically collapse consecutive spaces +when rendering `` elements, so it is recommended that +the following CSS be used: + + code{white-space: pre-wrap;} + Note that backslash escapes do not work in code spans. All backslashes are treated literally: @@ -5768,6 +5982,19 @@ Backslash escapes are never needed, because one can always choose a string of *n* backtick characters as delimiters, where the code does not contain any strings of exactly *n* backtick characters. +```````````````````````````````` example +``foo`bar`` +. +

foo`bar

+```````````````````````````````` + +```````````````````````````````` example +` foo `` bar ` +. +

foo `` bar

+```````````````````````````````` + + Code span backticks have higher precedence than any other inline constructs except HTML tags and autolinks. Thus, for example, this is not parsed as emphasized text, since the second `*` is part of a code @@ -5905,15 +6132,17 @@ of one or more `_` characters that is not preceded or followed by a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is -a [delimiter run] that is (a) not followed by [Unicode whitespace], -and (b) not followed by a [punctuation character], or +a [delimiter run] that is (1) not followed by [Unicode whitespace], +and either (2a) not followed by a [punctuation character], or +(2b) followed by a [punctuation character] and preceded by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. A [right-flanking delimiter run](@) is -a [delimiter run] that is (a) not preceded by [Unicode whitespace], -and (b) not preceded by a [punctuation character], or +a [delimiter run] that is (1) not preceded by [Unicode whitespace], +and either (2a) not preceded by a [punctuation character], or +(2b) preceded by a [punctuation character] and followed by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. @@ -6005,7 +6234,8 @@ The following rules define emphasis and strong emphasis: [delimiter runs]. If one of the delimiters can both open and close emphasis, then the sum of the lengths of the delimiter runs containing the opening and closing delimiters - must not be a multiple of 3. + must not be a multiple of 3 unless both lengths are + multiples of 3. 10. Strong emphasis begins with a delimiter that [can open strong emphasis] and ends with a delimiter that @@ -6015,7 +6245,8 @@ The following rules define emphasis and strong emphasis: [delimiter runs]. 
If one of the delimiters can both open and close strong emphasis, then the sum of the lengths of the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3. + delimiters must not be a multiple of 3 unless both lengths + are multiples of 3. 11. A literal `*` character cannot occur at the beginning or end of `*`-delimited emphasis or `**`-delimited strong emphasis, unless it @@ -6634,7 +6865,19 @@ is precluded by the condition that a delimiter that can both open and close (like the `*` after `foo`) cannot form emphasis if the sum of the lengths of the delimiter runs containing the opening and -closing delimiters is a multiple of 3. +closing delimiters is a multiple of 3 unless +both lengths are multiples of 3. + + +For the same reason, we don't get two consecutive +emphasis sections in this example: + +```````````````````````````````` example +*foo**bar* +. +

foo**bar

+```````````````````````````````` + The same condition ensures that the following cases are all strong emphasis nested inside @@ -6663,6 +6906,23 @@ omitted: ```````````````````````````````` +When the lengths of the interior closing and opening +delimiter runs are *both* multiples of 3, though, +they can match to create emphasis: + +```````````````````````````````` example +foo***bar***baz +. +

foobarbaz

+```````````````````````````````` + +```````````````````````````````` example +foo******bar*********baz +. +

foobar***baz

+```````````````````````````````` + + Indefinite levels of nesting are possible: ```````````````````````````````` example @@ -7198,15 +7458,16 @@ following rules apply: A [link destination](@) consists of either - a sequence of zero or more characters between an opening `<` and a - closing `>` that contains no spaces, line breaks, or unescaped + closing `>` that contains no line breaks or unescaped `<` or `>` characters, or -- a nonempty sequence of characters that does not include - ASCII space or control characters, and includes parentheses - only if (a) they are backslash-escaped or (b) they are part of - a balanced pair of unescaped parentheses. (Implementations - may impose limits on parentheses nesting to avoid performance - issues, but at least three levels of nesting should be supported.) +- a nonempty sequence of characters that does not start with + `<`, does not include ASCII space or control characters, and + includes parentheses only if (a) they are backslash-escaped or + (b) they are part of a balanced pair of unescaped parentheses. + (Implementations may impose limits on parentheses nesting to + avoid performance issues, but at least three levels of nesting + should be supported.) A [link title](@) consists of either @@ -7219,7 +7480,8 @@ A [link title](@) consists of either backslash-escaped, or - a sequence of zero or more characters between matching parentheses - (`(...)`), including a `)` character only if it is backslash-escaped. + (`(...)`), including a `(` or `)` character only if it is + backslash-escaped. Although [link titles] may span multiple lines, they may not contain a [blank line]. @@ -7269,9 +7531,8 @@ Both the title and the destination may be omitted:

link

```````````````````````````````` - -The destination cannot contain spaces or line breaks, -even if enclosed in pointy brackets: +The destination can only contain spaces if it is +enclosed in pointy brackets: ```````````````````````````````` example [link](/my uri) @@ -7279,13 +7540,14 @@ even if enclosed in pointy brackets:

[link](/my uri)

```````````````````````````````` - ```````````````````````````````` example [link](
) . -

[link](</my uri>)

+

link

```````````````````````````````` +The destination cannot contain line breaks, +even if enclosed in pointy brackets: ```````````````````````````````` example [link](foo @@ -7295,7 +7557,6 @@ bar) bar)

```````````````````````````````` - ```````````````````````````````` example [link]() @@ -7304,6 +7565,36 @@ bar>) bar>)

```````````````````````````````` +The destination can contain `)` if it is enclosed +in pointy brackets: + +```````````````````````````````` example +[a]() +. +

a

+```````````````````````````````` + +Pointy brackets that enclose links must be unescaped: + +```````````````````````````````` example +[link]() +. +

[link](<foo>)

+```````````````````````````````` + +These are not links, because the opening pointy bracket +is not matched properly: + +```````````````````````````````` example +[a]( +[a](c) +. +

[a](<b)c +[a](<b)c> +[a](c)

+```````````````````````````````` + Parentheses inside the link destination may be escaped: ```````````````````````````````` example @@ -8411,7 +8702,7 @@ If you want a link after a literal `!`, backslash-escape the as the link label. A [URI autolink](@) consists of `<`, followed by an -[absolute URI] not containing `<`, followed by `>`. It is parsed as +[absolute URI] followed by `>`. It is parsed as a link to the URI, with the URI as the link's label. An [absolute URI](@), @@ -8624,7 +8915,7 @@ a [single-quoted attribute value], or a [double-quoted attribute value]. An [unquoted attribute value](@) is a nonempty string of characters not -including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. +including [whitespace], `"`, `'`, `=`, `<`, `>`, or `` ` ``. A [single-quoted attribute value](@) consists of `'`, zero or more @@ -8745,9 +9036,13 @@ Illegal [whitespace]: ```````````````````````````````` example < a>< foo> + .

< a>< -foo><bar/ >

+foo><bar/ > +<foo bar=baz +bim!bop />

```````````````````````````````` @@ -8944,10 +9239,10 @@ bar

Line breaks do not occur inside code spans ```````````````````````````````` example -`code +`code span` . -

code span

+

code span

```````````````````````````````` @@ -9365,7 +9660,8 @@ just above `stack_bottom` (or the first element if `stack_bottom` is NULL). We keep track of the `openers_bottom` for each delimiter -type (`*`, `_`). Initialize this to `stack_bottom`. +type (`*`, `_`) and each length of the closing delimiter run +(modulo 3). Initialize this to `stack_bottom`. Then we repeat the following until we run out of potential closers: @@ -9397,7 +9693,7 @@ closers: of the delimiter stack. If the closing node is removed, reset `current_position` to the next element in the stack. -- If none in found: +- If none is found: + Set `openers_bottom` to the element before `current_position`. (We know that there are no openers for this kind of closer up to and From d08f5dca84f810671520ea8fb777706aa2dc9a01 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 15:01:54 +1000 Subject: [PATCH 02/22] Change how newlines/spaces are handled in inline code (spec 0.29) --- .../commonmark/internal/InlineParserImpl.java | 14 ++++++++-- .../org/commonmark/internal/util/Parsing.java | 6 +++++ .../org/commonmark/test/SpecCrLfCoreTest.java | 26 +++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index c6bbacc1a..bcb21415d 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -419,8 +419,18 @@ private Node parseBackticks() { if (matched.equals(ticks)) { Code node = new Code(); String content = input.substring(afterOpenTicks, index - ticks.length()); - String literal = WHITESPACE.matcher(content.trim()).replaceAll(" "); - node.setLiteral(literal); + content = content.replace('\n', ' '); + + // spec: If the resulting string both 
begins and ends with a space character, but does not consist + // entirely of space characters, a single space character is removed from the front and back. + if (content.length() >= 3 && + content.charAt(0) == ' ' && + content.charAt(content.length() - 1) == ' ' && + Parsing.hasNonSpace(content)) { + content = content.substring(1, content.length() - 1); + } + + node.setLiteral(content); return node; } } diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java index f5cc888ee..94f77858d 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java @@ -50,6 +50,12 @@ public static boolean isBlank(CharSequence s) { return findNonSpace(s, 0) == -1; } + public static boolean hasNonSpace(CharSequence s) { + int length = s.length(); + int skipped = skip(' ', s, 0, length); + return skipped != length; + } + public static boolean isLetter(CharSequence s, int index) { int codePoint = Character.codePointAt(s, index); return Character.isLetter(codePoint); diff --git a/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java b/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java new file mode 100644 index 000000000..6424ab659 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java @@ -0,0 +1,26 @@ +package org.commonmark.test; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.SpecTestCase; +import org.commonmark.testutil.example.Example; + +/** + * Same as {@link SpecCoreTest} but converts line endings to Windows-style CR+LF endings before parsing. + */ +public class SpecCrLfCoreTest extends SpecTestCase { + + private static final Parser PARSER = Parser.builder().build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. 
+ private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + + public SpecCrLfCoreTest(Example example) { + super(example); + } + + @Override + protected String render(String source) { + String windowsStyle = source.replace("\n", "\r\n"); + return RENDERER.render(PARSER.parse(windowsStyle)); + } +} From 62d5f1068fa3084a0f3757f54658e24336c6bd84 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 15:06:14 +1000 Subject: [PATCH 03/22] Info strings for tilde code blocks can contain backticks and tildes (spec 0.29) See https://github.com/commonmark/commonmark-spec/issues/119 --- .../org/commonmark/internal/FencedCodeBlockParser.java | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java index 6352892cf..e57cc7277 100644 --- a/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java @@ -102,17 +102,13 @@ private static FencedCodeBlockParser checkOpener(CharSequence line, int index, i } } if (backticks >= 3 && tildes == 0) { - // spec: The info string may not contain any backtick characters. + // spec: If the info string comes after a backtick fence, it may not contain any backtick characters. 
if (Parsing.find('`', line, index + backticks) != -1) { return null; } return new FencedCodeBlockParser('`', backticks, indent); } else if (tildes >= 3 && backticks == 0) { - // This follows commonmark.js but the spec is unclear about this: - // https://github.com/commonmark/CommonMark/issues/119 - if (Parsing.find('~', line, index + tildes) != -1) { - return null; - } + // spec: Info strings for tilde code blocks can contain backticks and tildes return new FencedCodeBlockParser('~', tildes, indent); } else { return null; From 9e10d399161b19a6b0dbe1bba192bb009f9bd996 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 15:17:43 +1000 Subject: [PATCH 04/22] Allow internal delim runs to match if both have lengths that are multiples of 3 (spec 0.29) --- .../internal/inline/EmphasisDelimiterProcessor.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java index 60f192bef..98b43938c 100644 --- a/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java +++ b/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java @@ -33,7 +33,9 @@ public int getMinLength() { @Override public int getDelimiterUse(DelimiterRun opener, DelimiterRun closer) { // "multiple of 3" rule for internal delimiter runs - if ((opener.canClose() || closer.canOpen()) && (opener.originalLength() + closer.originalLength()) % 3 == 0) { + if ((opener.canClose() || closer.canOpen()) && + closer.originalLength() % 3 != 0 && + (opener.originalLength() + closer.originalLength()) % 3 == 0) { return 0; } // calculate actual number of delimiters used from this closer From a3cc5f08860a75c4fc96be77379995ff80374d34 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 15:38:11 +1000 Subject: [PATCH 05/22] Fix pathological case with 
input `[\\\\...` (a lot of backslashes) --- .../java/org/commonmark/internal/InlineParserImpl.java | 3 ++- .../test/java/org/commonmark/test/PathologicalTest.java | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index bcb21415d..d236cca31 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -39,7 +39,8 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { private static final Pattern LINK_DESTINATION_BRACES = Pattern.compile("^(?:[<](?:[^<> \\t\\n\\\\]|\\\\.)*[>])"); - private static final Pattern LINK_LABEL = Pattern.compile("^\\[(?:[^\\\\\\[\\]]|\\\\.)*\\]"); + private static final Pattern LINK_LABEL = Pattern.compile( + "^\\[(?:[^\\\\\\[\\]]|\\\\.){0,1000}\\]"); private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE); diff --git a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java index 8c5d57dd4..ddf44e268 100644 --- a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java +++ b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java @@ -106,4 +106,11 @@ public void hugeHorizontalRule() { repeat("*", 10000) + "\n", "
\n"); } + + @Test + public void backslashInLink() { + // See https://github.com/commonmark/commonmark.js/issues/157 + assertRendering("[" + repeat("\\", x) + "\n", + "

" + "[" + repeat("\\", x / 2) + "

\n"); + } } From e368db33d279b04ec779273c24e7ca4e952825e3 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 15:45:32 +1000 Subject: [PATCH 06/22] Fix pathological case with input `[]([]([](...` See https://github.com/commonmark/commonmark.js/issues/129 --- .../commonmark/internal/InlineParserImpl.java | 21 ++++++++++++------- .../org/commonmark/test/PathologicalTest.java | 7 +++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index d236cca31..c75a9bb02 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -638,18 +638,21 @@ private String parseLinkDestination() { } } else { int startIndex = index; - parseLinkDestinationWithBalancedParens(); - return Escaping.unescapeString(input.substring(startIndex, index)); + if (parseLinkDestinationWithBalancedParens()) { + return Escaping.unescapeString(input.substring(startIndex, index)); + } else { + return null; + } } } - private void parseLinkDestinationWithBalancedParens() { + private boolean parseLinkDestinationWithBalancedParens() { int parens = 0; while (true) { char c = peek(); switch (c) { case '\0': - return; + return true; case '\\': // check if we have an escapable character if (index + 1 < input.length() && ESCAPABLE.matcher(input.substring(index + 1, index + 2)).matches()) { @@ -661,21 +664,25 @@ private void parseLinkDestinationWithBalancedParens() { break; case '(': parens++; + // Limit to 32 nested parens for pathological cases + if (parens > 32) { + return false; + } break; case ')': if (parens == 0) { - return; + return true; } else { parens--; } break; case ' ': // ASCII space - return; + return true; default: // or control character if (Character.isISOControl(c)) { - return; + return true; } } index++; diff --git 
a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java index ddf44e268..a853b1b11 100644 --- a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java +++ b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java @@ -113,4 +113,11 @@ public void backslashInLink() { assertRendering("[" + repeat("\\", x) + "\n", "

" + "[" + repeat("\\", x / 2) + "

\n"); } + + @Test + public void unclosedInlineLinks() { + // See https://github.com/commonmark/commonmark.js/issues/129 + assertRendering(repeat("[](", x) + "\n", + "

" + repeat("[](", x) + "

\n"); + } } From ccff6918b68fd7951c021ea1a6bd7e207421b2e9 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 16:21:16 +1000 Subject: [PATCH 07/22] Changes to link destination parsing (spec 0.29) * Allow spaces inside link destinations in pointy brackets * Disallow link destination beginning with `<` unless it is inside `<..>` --- .../commonmark/internal/InlineParserImpl.java | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index c75a9bb02..da3b93ab4 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -37,7 +37,8 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { '|' + "\\((" + ESCAPED_CHAR + "|[^)\\x00])*\\))"); - private static final Pattern LINK_DESTINATION_BRACES = Pattern.compile("^(?:[<](?:[^<> \\t\\n\\\\]|\\\\.)*[>])"); + private static final Pattern LINK_DESTINATION_BRACES = Pattern.compile( + "^(?:[<](?:[^<>\n\\\\\\x00]|\\\\.)*[>])"); private static final Pattern LINK_LABEL = Pattern.compile( "^\\[(?:[^\\\\\\[\\]]|\\\\.){0,1000}\\]"); @@ -190,7 +191,7 @@ public int parseReference(String s) { this.input = s; this.index = 0; String dest; - String title; + String title = null; int matchChars; int startIndex = index; @@ -212,13 +213,15 @@ public int parseReference(String s) { spnl(); dest = parseLinkDestination(); - if (dest == null || dest.length() == 0) { + if (dest == null) { return 0; } int beforeTitle = index; spnl(); - title = parseLinkTitle(); + if (index != beforeTitle) { + title = parseLinkTitle(); + } if (title == null) { // rewind before spaces index = beforeTitle; @@ -637,6 +640,9 @@ private String parseLinkDestination() { return Escaping.unescapeString(res.substring(1, res.length() - 1)); } } else { + if (peek() == 
'<') { + return null; + } int startIndex = index; if (parseLinkDestinationWithBalancedParens()) { return Escaping.unescapeString(input.substring(startIndex, index)); @@ -647,12 +653,14 @@ private String parseLinkDestination() { } private boolean parseLinkDestinationWithBalancedParens() { + int startIndex = index; int parens = 0; while (true) { char c = peek(); switch (c) { case '\0': - return true; + case ' ': + return startIndex != index; case '\\': // check if we have an escapable character if (index + 1 < input.length() && ESCAPABLE.matcher(input.substring(index + 1, index + 2)).matches()) { @@ -676,13 +684,10 @@ private boolean parseLinkDestinationWithBalancedParens() { parens--; } break; - case ' ': - // ASCII space - return true; default: // or control character if (Character.isISOControl(c)) { - return true; + return startIndex != index; } } index++; From 79c0a7cc6d64c541d17a2729dec28cf07dabd43e Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Apr 2019 16:26:32 +1000 Subject: [PATCH 08/22] Disallow unescaped '(' in link title See https://github.com/commonmark/commonmark-spec/issues/526 --- .../src/main/java/org/commonmark/internal/InlineParserImpl.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index da3b93ab4..d7d630193 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -35,7 +35,7 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { '|' + "'(" + ESCAPED_CHAR + "|[^'\\x00])*'" + '|' + - "\\((" + ESCAPED_CHAR + "|[^)\\x00])*\\))"); + "\\((" + ESCAPED_CHAR + "|[^()\\x00])*\\))"); private static final Pattern LINK_DESTINATION_BRACES = Pattern.compile( "^(?:[<](?:[^<>\n\\\\\\x00]|\\\\.)*[>])"); From 8e363803253a931c638ab420d8a8558a0a974f00 Mon Sep 17 
00:00:00 2001 From: Robin Stocker Date: Mon, 15 Apr 2019 16:35:35 +1000 Subject: [PATCH 09/22] Adapt link reference definition parsing to spec changes The spec was updated to clarify that they can also be part of setext headings. The way we make these work in our parser is to be able to "replace" the active paragraph block. In that case, we still need to collect link reference definitions from it. In order to implement this, I had to separate it from InlineParser. The new way is cleaner, and allows us to add a Node to the document as well (for #98). I think there's still room for improvement: We could parse the definition as we go in ParagraphParser and collect them earlier. That might eliminate the current "double parsing" that we do in some cases. --- CHANGELOG.md | 5 + .../commonmark/internal/DocumentParser.java | 78 +++++++++-- .../internal/InlineParserContextImpl.java | 30 +++++ .../commonmark/internal/InlineParserImpl.java | 125 ++++-------------- .../LinkReferenceDefinitionParser.java | 124 +++++++++++++++++ .../commonmark/internal/ParagraphParser.java | 26 +--- .../commonmark/internal/ReferenceParser.java | 11 -- .../org/commonmark/node/AbstractVisitor.java | 5 + .../node/LinkReferenceDefinition.java | 60 +++++++++ .../java/org/commonmark/node/Visitor.java | 4 +- .../parser/InlineParserContext.java | 16 ++- .../java/org/commonmark/parser/Parser.java | 54 ++++---- 12 files changed, 365 insertions(+), 173 deletions(-) create mode 100644 commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java create mode 100644 commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java delete mode 100644 commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java create mode 100644 commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java diff --git a/CHANGELOG.md b/CHANGELOG.md index cb5f8c178..17e470514 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,12 @@ This project adheres to [Semantic 
Versioning](http://semver.org/spec/v2.0.0.html with the exception that 0.x versions can break between minor versions. ## Unreleased +### Added +- `InlineParserContext.getLinkReferenceDefinition` was added to allow + custom inline parsers to look up definitions for reference links. ### Changed +- Link reference definition parsing has been changed according to the + spec: Definitions can now be in setext headings too. - Check non-null arguments early and provide a nicer message ## [0.12.1] - 2018-11-13 diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java index 539516c1d..b716286dc 100644 --- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java @@ -3,7 +3,9 @@ import org.commonmark.internal.util.Parsing; import org.commonmark.node.*; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.InlineParserFactory; import org.commonmark.parser.block.*; +import org.commonmark.parser.delimiter.DelimiterProcessor; import java.io.BufferedReader; import java.io.IOException; @@ -59,17 +61,23 @@ public class DocumentParser implements ParserState { private boolean blank; private final List blockParserFactories; - private final InlineParser inlineParser; + private final InlineParserFactory inlineParserFactory; + private final List delimiterProcessors; private final DocumentBlockParser documentBlockParser; + private final LinkReferenceDefinitionParser linkReferenceDefinitionParser; private List activeBlockParsers = new ArrayList<>(); - private Set allBlockParsers = new HashSet<>(); + // LinkedHashSet to have a deterministic order + private Set allBlockParsers = new LinkedHashSet<>(); - public DocumentParser(List blockParserFactories, InlineParser inlineParser) { + public DocumentParser(List blockParserFactories, InlineParserFactory inlineParserFactory, + List 
delimiterProcessors) { this.blockParserFactories = blockParserFactories; - this.inlineParser = inlineParser; + this.inlineParserFactory = inlineParserFactory; + this.delimiterProcessors = delimiterProcessors; this.documentBlockParser = new DocumentBlockParser(); + this.linkReferenceDefinitionParser = new LinkReferenceDefinitionParser(); activateBlockParser(this.documentBlockParser); } @@ -233,7 +241,7 @@ private void incorporateLine(CharSequence ln) { } if (blockStart.isReplaceActiveBlockParser()) { - removeActiveBlockParser(); + prepareActiveBlockParserForReplacement(); } for (BlockParser newBlockParser : blockStart.getBlockParsers()) { @@ -386,10 +394,26 @@ private void finalize(BlockParser blockParser) { blockParser.closeBlock(); - if (blockParser instanceof ParagraphParser - && inlineParser instanceof ReferenceParser) { + if (blockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) blockParser; - paragraphParser.closeBlock((ReferenceParser) inlineParser); + String content = paragraphParser.getContentString(); + + Block paragraph = paragraphParser.getBlock(); + + // TODO: Insert resulting nodes into AST (before paragraph node) + int afterDefinitions = linkReferenceDefinitionParser.parseDefinitions(content); + + if (afterDefinitions != 0) { + String remainingContent = content.substring(afterDefinitions); + if (Parsing.isBlank(remainingContent)) { + // If all we had was reference definitions, remove the block that's now empty + paragraph.unlink(); + paragraphParser.setContentString(""); + } else { + // We had some content after the definitions, use that for the paragraph + paragraphParser.setContentString(remainingContent); + } + } } } @@ -397,6 +421,17 @@ private void finalize(BlockParser blockParser) { * Walk through a block & children recursively, parsing string content into inline content where appropriate. 
*/ private void processInlines() { + Map definitions = new LinkedHashMap<>(); + for (LinkReferenceDefinition definition : linkReferenceDefinitionParser.getDefinitions()) { + String label = definition.getLabel(); + // spec: When there are multiple matching link reference definitions, the first is used + if (!definitions.containsKey(label)) { + definitions.put(label, definition); + } + } + InlineParserContextImpl context = new InlineParserContextImpl(delimiterProcessors, definitions); + InlineParser inlineParser = inlineParserFactory.create(context); + for (BlockParser blockParser : allBlockParsers) { blockParser.parseInlines(inlineParser); } @@ -426,11 +461,21 @@ private void deactivateBlockParser() { activeBlockParsers.remove(activeBlockParsers.size() - 1); } - private void removeActiveBlockParser() { + private void prepareActiveBlockParserForReplacement() { BlockParser old = getActiveBlockParser(); deactivateBlockParser(); allBlockParsers.remove(old); + if (old instanceof ParagraphParser) { + String content = ((ParagraphParser) old).getContentString(); + // Collect any link reference definitions. Note that replacing the active block parser is done after a + // block parser got the current paragraph content using MatchedBlockParser#getContentString. In our + // implementation of that, we strip link reference definitions from the paragraph content before we give it + // to the block parser. We want to keep them. If no replacement happens, we collect the definitions as part + // of finalizing paragraph blocks. 
+ linkReferenceDefinitionParser.parseDefinitions(content); + } + old.getBlock().unlink(); } @@ -467,7 +512,20 @@ public BlockParser getMatchedBlockParser() { public CharSequence getParagraphContent() { if (matchedBlockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) matchedBlockParser; - return paragraphParser.getContentString(); + String content = paragraphParser.getContentString(); + + // Strip link reference definitions, they are not going to be part of the paragraph text. + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + int afterDefinitions = parser.parseDefinitions(content); + if (afterDefinitions != 0) { + content = content.substring(afterDefinitions); + if (Parsing.isBlank(content)) { + // Paragraph consists only of link reference definitions -> no actual paragraph content + return null; + } + } + + return content; } return null; } diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java new file mode 100644 index 000000000..bff085ad8 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java @@ -0,0 +1,30 @@ +package org.commonmark.internal; + +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.util.List; +import java.util.Map; + +public class InlineParserContextImpl implements InlineParserContext { + + private final List delimiterProcessors; + private final Map linkReferenceDefinitions; + + public InlineParserContextImpl(List delimiterProcessors, + Map linkReferenceDefinitions) { + this.delimiterProcessors = delimiterProcessors; + this.linkReferenceDefinitions = linkReferenceDefinitions; + } + + @Override + public List getCustomDelimiterProcessors() { + return delimiterProcessors; + } + + @Override + public 
LinkReferenceDefinition getLinkReferenceDefinition(String label) { + return linkReferenceDefinitions.get(label); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index d7d630193..ad93b586e 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -7,13 +7,14 @@ import org.commonmark.internal.util.Parsing; import org.commonmark.node.*; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.InlineParserContext; import org.commonmark.parser.delimiter.DelimiterProcessor; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class InlineParserImpl implements InlineParser, ReferenceParser { +public class InlineParserImpl implements InlineParser { private static final String ESCAPED_CHAR = "\\\\" + Escaping.ESCAPABLE; private static final String HTMLCOMMENT = "|"; @@ -65,19 +66,13 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { private static final Pattern FINAL_SPACE = Pattern.compile(" *$"); - private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); - private final BitSet specialCharacters; private final BitSet delimiterCharacters; private final Map delimiterProcessors; + private final InlineParserContext context; - /** - * Link references by ID, needs to be built up using parseReference before calling parse. - */ - private Map referenceMap = new HashMap<>(); - - private String input; - private int index; + String input; + int index; /** * Top delimiter (emphasis, strong emphasis or custom emphasis). 
(Brackets are on a separate stack, different @@ -90,10 +85,12 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { */ private Bracket lastBracket; - public InlineParserImpl(List delimiterProcessors) { - this.delimiterProcessors = calculateDelimiterProcessors(delimiterProcessors); + public InlineParserImpl(InlineParserContext inlineParserContext) { + this.delimiterProcessors = calculateDelimiterProcessors(inlineParserContext.getCustomDelimiterProcessors()); this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet()); this.specialCharacters = calculateSpecialCharacters(delimiterCharacters); + + this.context = inlineParserContext; } public static BitSet calculateDelimiterCharacters(Set characters) { @@ -163,10 +160,7 @@ private static void addDelimiterProcessorForChar(char delimiterChar, DelimiterPr */ @Override public void parse(String content, Node block) { - this.input = content.trim(); - this.index = 0; - this.lastDelimiter = null; - this.lastBracket = null; + reset(content.trim()); Node previous = null; while (true) { @@ -183,82 +177,14 @@ public void parse(String content, Node block) { mergeChildTextNodes(block); } - /** - * Attempt to parse a link reference, modifying the internal reference map. 
- */ - @Override - public int parseReference(String s) { - this.input = s; + void reset(String content) { + this.input = content; this.index = 0; - String dest; - String title = null; - int matchChars; - int startIndex = index; - - // label: - matchChars = parseLinkLabel(); - if (matchChars == 0) { - return 0; - } - - String rawLabel = input.substring(0, matchChars); - - // colon: - if (peek() != ':') { - return 0; - } - index++; - - // link url - spnl(); - - dest = parseLinkDestination(); - if (dest == null) { - return 0; - } - - int beforeTitle = index; - spnl(); - if (index != beforeTitle) { - title = parseLinkTitle(); - } - if (title == null) { - // rewind before spaces - index = beforeTitle; - } - - boolean atLineEnd = true; - if (index != input.length() && match(LINE_END) == null) { - if (title == null) { - atLineEnd = false; - } else { - // the potential title we found is not at the line end, - // but it could still be a legal link reference if we - // discard the title - title = null; - // rewind before spaces - index = beforeTitle; - // and instead check if the link URL is at the line end - atLineEnd = match(LINE_END) != null; - } - } - - if (!atLineEnd) { - return 0; - } - - String normalizedLabel = Escaping.normalizeReference(rawLabel); - if (normalizedLabel.isEmpty()) { - return 0; - } - - if (!referenceMap.containsKey(normalizedLabel)) { - Link link = new Link(dest, title); - referenceMap.put(normalizedLabel, link); - } - return index - startIndex; + this.lastDelimiter = null; + this.lastBracket = null; } + private Text text(String text, int beginIndex, int endIndex) { return new Text(text.substring(beginIndex, endIndex)); } @@ -331,7 +257,7 @@ private Node parseInline(Node previous) { /** * If RE matches at current index in the input, advance index and return the match; otherwise return null. 
*/ - private String match(Pattern re) { + String match(Pattern re) { if (index >= input.length()) { return null; } @@ -349,7 +275,7 @@ private String match(Pattern re) { /** * Returns the char at the current input index, or {@code '\0'} in case there are no more characters. */ - private char peek() { + char peek() { if (index < input.length()) { return input.charAt(index); } else { @@ -360,7 +286,7 @@ private char peek() { /** * Parse zero or more space characters, including at most one newline. */ - private void spnl() { + void spnl() { match(SPNL); } @@ -568,10 +494,11 @@ private Node parseCloseBracket() { } if (ref != null) { - Link link = referenceMap.get(Escaping.normalizeReference(ref)); - if (link != null) { - dest = link.getDestination(); - title = link.getTitle(); + String label = Escaping.normalizeReference(ref); + LinkReferenceDefinition definition = context.getLinkReferenceDefinition(label); + if (definition != null) { + dest = definition.getDestination(); + title = definition.getTitle(); isLinkOrImage = true; } } @@ -631,7 +558,7 @@ private void removeLastBracket() { /** * Attempt to parse link destination, returning the string or null if no match. */ - private String parseLinkDestination() { + String parseLinkDestination() { String res = match(LINK_DESTINATION_BRACES); if (res != null) { // chop off surrounding <..>: if (res.length() == 2) { @@ -697,7 +624,7 @@ private boolean parseLinkDestinationWithBalancedParens() { /** * Attempt to parse link title (sans quotes), returning the string or null if no match. */ - private String parseLinkTitle() { + String parseLinkTitle() { String title = match(LINK_TITLE); if (title != null) { // chop off quotes from title and unescape: @@ -710,7 +637,7 @@ private String parseLinkTitle() { /** * Attempt to parse a link label, returning number of characters parsed. 
*/ - private int parseLinkLabel() { + int parseLinkLabel() { String m = match(LINK_LABEL); // Spec says "A link label can have at most 999 characters inside the square brackets" if (m == null || m.length() > 1001) { diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java new file mode 100644 index 000000000..200f67d3c --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java @@ -0,0 +1,124 @@ +package org.commonmark.internal; + +import org.commonmark.internal.util.Escaping; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** + * @see Link reference definitions + */ +class LinkReferenceDefinitionParser extends InlineParserImpl { + + /** + * Parsed link reference definitions by label, in order of occurrence. + */ + private List definitions = new ArrayList<>(); + + private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); + + LinkReferenceDefinitionParser() { + // Not needed for parsing link reference definitions + super(new InlineParserContextImpl(Collections.emptyList(), + Collections.emptyMap())); + } + + // TODO: Would be better to just return them from the method. + List getDefinitions() { + return definitions; + } + + /** + * Parse all link reference definitions, add them to the map and return the length of the text we parsed (if any). 
+ */ + int parseDefinitions(String content) { + int afterAllDefinitions = 0; + while (content.length() > 3 && content.charAt(0) == '[') { + int afterDefinition = parseDefinition(content); + if (afterDefinition != 0) { + content = content.substring(afterDefinition); + afterAllDefinitions += afterDefinition; + } else { + break; + } + } + return afterAllDefinitions; + } + + /** + * Attempt to parse a single link reference definition, adding it to the map. + */ + private int parseDefinition(String content) { + reset(content); + + String dest; + String title = null; + int matchChars; + int startIndex = index; + + // label: + matchChars = parseLinkLabel(); + if (matchChars == 0) { + return 0; + } + + String rawLabel = input.substring(0, matchChars); + + // colon: + if (peek() != ':') { + return 0; + } + index++; + + // link url + spnl(); + + dest = parseLinkDestination(); + if (dest == null) { + return 0; + } + + int beforeTitle = index; + spnl(); + if (index != beforeTitle) { + title = parseLinkTitle(); + } + if (title == null) { + // rewind before spaces + index = beforeTitle; + } + + boolean atLineEnd = true; + if (index != input.length() && match(LINE_END) == null) { + if (title == null) { + atLineEnd = false; + } else { + // the potential title we found is not at the line end, + // but it could still be a legal link reference if we + // discard the title + title = null; + // rewind before spaces + index = beforeTitle; + // and instead check if the link URL is at the line end + atLineEnd = match(LINE_END) != null; + } + } + + if (!atLineEnd) { + return 0; + } + + String normalizedLabel = Escaping.normalizeReference(rawLabel); + if (normalizedLabel.isEmpty()) { + return 0; + } + + definitions.add(new LinkReferenceDefinition(normalizedLabel, dest, title)); + + return index - startIndex; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java index 
fac8cfadc..f6556ab2e 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java @@ -1,11 +1,10 @@ package org.commonmark.internal; -import org.commonmark.internal.util.Parsing; import org.commonmark.node.Block; import org.commonmark.node.Paragraph; +import org.commonmark.parser.InlineParser; import org.commonmark.parser.block.AbstractBlockParser; import org.commonmark.parser.block.BlockContinue; -import org.commonmark.parser.InlineParser; import org.commonmark.parser.block.ParserState; public class ParagraphParser extends AbstractBlockParser { @@ -36,25 +35,6 @@ public void addLine(CharSequence line) { public void closeBlock() { } - public void closeBlock(ReferenceParser inlineParser) { - String contentString = content.getString(); - boolean hasReferenceDefs = false; - - int pos; - // try parsing the beginning as link reference definitions: - while (contentString.length() > 3 && contentString.charAt(0) == '[' && - (pos = inlineParser.parseReference(contentString)) != 0) { - contentString = contentString.substring(pos); - hasReferenceDefs = true; - } - if (hasReferenceDefs && Parsing.isBlank(contentString)) { - block.unlink(); - content = null; - } else { - content = new BlockContent(contentString); - } - } - @Override public void parseInlines(InlineParser inlineParser) { if (content != null) { @@ -65,4 +45,8 @@ public void parseInlines(InlineParser inlineParser) { public String getContentString() { return content.getString(); } + + void setContentString(String contentString) { + content = new BlockContent(contentString); + } } diff --git a/commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java b/commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java deleted file mode 100644 index 35f36cb59..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java +++ /dev/null @@ -1,11 +0,0 @@ -package 
org.commonmark.internal; - -/** - * Parser for inline references - */ -public interface ReferenceParser { - /** - * @return how many characters were parsed as a reference, {@code 0} if none - */ - int parseReference(String s); -} diff --git a/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java b/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java index 381c72b66..7edd635d7 100644 --- a/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java +++ b/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java @@ -108,6 +108,11 @@ public void visit(Text text) { visitChildren(text); } + @Override + public void visit(LinkReferenceDefinition linkReferenceDefinition) { + visitChildren(linkReferenceDefinition); + } + @Override public void visit(CustomBlock customBlock) { visitChildren(customBlock); diff --git a/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java b/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java new file mode 100644 index 000000000..a4578e99b --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java @@ -0,0 +1,60 @@ +package org.commonmark.node; + +// TODO: We're currently not adding these to the document. +// But that would be very useful for being able to render Nodes back to Markdown, see #98. + +/** + * A link reference definition, e.g.: + *

+ * [foo]: /url "title"
+ * 
+ *

+ * They can be referenced anywhere else in the document to produce a link using [foo]. The definitions + * themselves are usually not rendered in the final output. + * + * @see Link reference definitions + */ +public class LinkReferenceDefinition extends Node { + + private String label; + private String destination; + private String title; + + public LinkReferenceDefinition() { + } + + public LinkReferenceDefinition(String label, String destination, String title) { + this.label = label; + this.destination = destination; + this.title = title; + } + + public String getLabel() { + return label; + } + + public void setLabel(String label) { + this.label = label; + } + + public String getDestination() { + return destination; + } + + public void setDestination(String destination) { + this.destination = destination; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + @Override + public void accept(Visitor visitor) { + visitor.visit(this); + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/Visitor.java b/commonmark/src/main/java/org/commonmark/node/Visitor.java index 8851b7b18..a155296f0 100644 --- a/commonmark/src/main/java/org/commonmark/node/Visitor.java +++ b/commonmark/src/main/java/org/commonmark/node/Visitor.java @@ -3,7 +3,7 @@ /** * Node visitor. *

- * See {@link AbstractVisitor} for a base class that can be extended. + * Implementations should subclass {@link AbstractVisitor} instead of implementing this directly. */ public interface Visitor { @@ -47,6 +47,8 @@ public interface Visitor { void visit(Text text); + void visit(LinkReferenceDefinition linkReferenceDefinition); + void visit(CustomBlock customBlock); void visit(CustomNode customNode); diff --git a/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java b/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java index 7a3be522d..467742e2c 100644 --- a/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java +++ b/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java @@ -1,13 +1,25 @@ package org.commonmark.parser; +import org.commonmark.node.LinkReferenceDefinition; import org.commonmark.parser.delimiter.DelimiterProcessor; import java.util.List; -import java.util.Map; /** - * Parameter context for custom inline parser. + * Context for inline parsing. */ public interface InlineParserContext { + + /** + * @return custom delimiter processors that have been configured with {@link Parser.Builder#customDelimiterProcessor(DelimiterProcessor)} + */ List getCustomDelimiterProcessors(); + + /** + * Look up a {@link LinkReferenceDefinition} for a given label. 
+ * + * @param label the link label to look up + * @return the definition if one exists, {@code null} otherwise + */ + LinkReferenceDefinition getLinkReferenceDefinition(String label); } diff --git a/commonmark/src/main/java/org/commonmark/parser/Parser.java b/commonmark/src/main/java/org/commonmark/parser/Parser.java index 04d28065f..5e15158ad 100644 --- a/commonmark/src/main/java/org/commonmark/parser/Parser.java +++ b/commonmark/src/main/java/org/commonmark/parser/Parser.java @@ -2,6 +2,7 @@ import org.commonmark.Extension; import org.commonmark.internal.DocumentParser; +import org.commonmark.internal.InlineParserContextImpl; import org.commonmark.internal.InlineParserImpl; import org.commonmark.node.*; import org.commonmark.parser.block.BlockParserFactory; @@ -10,6 +11,7 @@ import java.io.IOException; import java.io.Reader; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Set; @@ -32,12 +34,14 @@ public class Parser { private Parser(Builder builder) { this.blockParserFactories = DocumentParser.calculateBlockParserFactories(builder.blockParserFactories, builder.enabledBlockTypes); - this.inlineParserFactory = builder.inlineParserFactory; + this.inlineParserFactory = builder.getInlineParserFactory(); this.postProcessors = builder.postProcessors; this.delimiterProcessors = builder.delimiterProcessors; - // Try to construct an inline parser. This might raise exceptions in case of invalid configuration. - getInlineParser(); + // Try to construct an inline parser. Invalid configuration might result in an exception, which we want to + // detect as soon as possible. 
+ this.inlineParserFactory.create(new InlineParserContextImpl(delimiterProcessors, + Collections.emptyMap())); } /** @@ -61,8 +65,7 @@ public Node parse(String input) { if (input == null) { throw new NullPointerException("input must not be null"); } - InlineParser inlineParser = getInlineParser(); - DocumentParser documentParser = new DocumentParser(blockParserFactories, inlineParser); + DocumentParser documentParser = createDocumentParser(); Node document = documentParser.parse(input); return postProcess(document); } @@ -89,19 +92,14 @@ public Node parseReader(Reader input) throws IOException { if (input == null) { throw new NullPointerException("input must not be null"); } - InlineParser inlineParser = getInlineParser(); - DocumentParser documentParser = new DocumentParser(blockParserFactories, inlineParser); + + DocumentParser documentParser = createDocumentParser(); Node document = documentParser.parse(input); return postProcess(document); } - private InlineParser getInlineParser() { - if (this.inlineParserFactory == null) { - return new InlineParserImpl(delimiterProcessors); - } else { - CustomInlineParserContext inlineParserContext = new CustomInlineParserContext(delimiterProcessors); - return this.inlineParserFactory.create(inlineParserContext); - } + private DocumentParser createDocumentParser() { + return new DocumentParser(blockParserFactories, inlineParserFactory, delimiterProcessors); } private Node postProcess(Node document) { @@ -111,20 +109,6 @@ private Node postProcess(Node document) { return document; } - private class CustomInlineParserContext implements InlineParserContext { - - private List delimiterProcessors; - - CustomInlineParserContext(List delimiterProcessors) { - this.delimiterProcessors = delimiterProcessors; - } - - @Override - public List getCustomDelimiterProcessors() { - return delimiterProcessors; - } - } - /** * Builder for configuring a {@link Parser}. 
*/ @@ -133,7 +117,7 @@ public static class Builder { private final List delimiterProcessors = new ArrayList<>(); private final List postProcessors = new ArrayList<>(); private Set> enabledBlockTypes = DocumentParser.getDefaultBlockParserTypes(); - private InlineParserFactory inlineParserFactory = null; + private InlineParserFactory inlineParserFactory; /** * @return the configured {@link Parser} @@ -261,6 +245,18 @@ public Builder inlineParserFactory(InlineParserFactory inlineParserFactory) { this.inlineParserFactory = inlineParserFactory; return this; } + + private InlineParserFactory getInlineParserFactory() { + if (inlineParserFactory != null) { + return inlineParserFactory; + } + return new InlineParserFactory() { + @Override + public InlineParser create(InlineParserContext inlineParserContext) { + return new InlineParserImpl(inlineParserContext); + } + }; + } } /** From 0f4613add1a61dbd3737b7205dfb816eacfa649e Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Sat, 1 Jun 2019 20:26:19 +1000 Subject: [PATCH 10/22] Extract link scanning logic and replace use of regexes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hopefully this will allow us to reuse it for parsing link reference definitions in an incremental way. 
Also speeds up parsing a bit: -SpecBenchmark.parseExamples thrpt 50 453.493 ± 4.647 ops/s +SpecBenchmark.parseExamples thrpt 50 483.467 ± 3.418 ops/s --- .../commonmark/internal/InlineParserImpl.java | 120 +++++--------- .../commonmark/internal/util/Escaping.java | 11 +- .../commonmark/internal/util/LinkScanner.java | 148 ++++++++++++++++++ .../org/commonmark/internal/util/Parsing.java | 41 +++++ 4 files changed, 234 insertions(+), 86 deletions(-) create mode 100644 commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index ad93b586e..00da9134d 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -4,6 +4,7 @@ import org.commonmark.internal.inline.UnderscoreDelimiterProcessor; import org.commonmark.internal.util.Escaping; import org.commonmark.internal.util.Html5Entities; +import org.commonmark.internal.util.LinkScanner; import org.commonmark.internal.util.Parsing; import org.commonmark.node.*; import org.commonmark.parser.InlineParser; @@ -15,8 +16,7 @@ import java.util.regex.Pattern; public class InlineParserImpl implements InlineParser { - - private static final String ESCAPED_CHAR = "\\\\" + Escaping.ESCAPABLE; + private static final String HTMLCOMMENT = "|"; private static final String PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"; private static final String DECLARATION = "]*>"; @@ -31,19 +31,6 @@ public class InlineParserImpl implements InlineParser { private static final Pattern HTML_TAG = Pattern.compile('^' + HTMLTAG, Pattern.CASE_INSENSITIVE); - private static final Pattern LINK_TITLE = Pattern.compile( - "^(?:\"(" + ESCAPED_CHAR + "|[^\"\\x00])*\"" + - '|' + - "'(" + ESCAPED_CHAR + "|[^'\\x00])*'" + - '|' + - "\\((" + ESCAPED_CHAR + "|[^()\\x00])*\\))"); - - private static 
final Pattern LINK_DESTINATION_BRACES = Pattern.compile( - "^(?:[<](?:[^<>\n\\\\\\x00]|\\\\.)*[>])"); - - private static final Pattern LINK_LABEL = Pattern.compile( - "^\\[(?:[^\\\\\\[\\]]|\\\\.){0,1000}\\]"); - private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE); private static final Pattern ENTITY_HERE = Pattern.compile('^' + ENTITY, Pattern.CASE_INSENSITIVE); @@ -482,7 +469,8 @@ private Node parseCloseBracket() { // See if there's a link label like `[bar]` or `[]` int beforeLabel = index; - int labelLength = parseLinkLabel(); + parseLinkLabel(); + int labelLength = index - beforeLabel; String ref = null; if (labelLength > 2) { ref = input.substring(beforeLabel, beforeLabel + labelLength); @@ -559,92 +547,58 @@ private void removeLastBracket() { * Attempt to parse link destination, returning the string or null if no match. */ String parseLinkDestination() { - String res = match(LINK_DESTINATION_BRACES); - if (res != null) { // chop off surrounding <..>: - if (res.length() == 2) { - return ""; - } else { - return Escaping.unescapeString(res.substring(1, res.length() - 1)); - } - } else { - if (peek() == '<') { - return null; - } - int startIndex = index; - if (parseLinkDestinationWithBalancedParens()) { - return Escaping.unescapeString(input.substring(startIndex, index)); - } else { - return null; - } + int afterDest = LinkScanner.scanLinkDestination(input, index); + if (afterDest == -1) { + return null; } - } - private boolean parseLinkDestinationWithBalancedParens() { - int startIndex = index; - int parens = 0; - while (true) { - char c = peek(); - switch (c) { - case '\0': - case ' ': - return startIndex != index; - case '\\': - // check if we have an escapable character - if (index + 1 < input.length() && ESCAPABLE.matcher(input.substring(index + 1, index + 2)).matches()) { - // skip over the escaped character (after switch) - index++; - break; - } - // otherwise, we treat this as a literal backslash - break; - case '(': - 
parens++; - // Limit to 32 nested parens for pathological cases - if (parens > 32) { - return false; - } - break; - case ')': - if (parens == 0) { - return true; - } else { - parens--; - } - break; - default: - // or control character - if (Character.isISOControl(c)) { - return startIndex != index; - } - } - index++; + String dest; + if (peek() == '<') { + // chop off surrounding <..>: + dest = input.substring(index + 1, afterDest - 1); + } else { + dest = input.substring(index, afterDest); } + + index = afterDest; + return Escaping.unescapeString(dest); } /** * Attempt to parse link title (sans quotes), returning the string or null if no match. */ String parseLinkTitle() { - String title = match(LINK_TITLE); - if (title != null) { - // chop off quotes from title and unescape: - return Escaping.unescapeString(title.substring(1, title.length() - 1)); - } else { + int afterTitle = LinkScanner.scanLinkTitle(input, index); + if (afterTitle == -1) { return null; } + + // chop off ', " or parens + String title = input.substring(index + 1, afterTitle - 1); + index = afterTitle; + return Escaping.unescapeString(title); } /** * Attempt to parse a link label, returning number of characters parsed. */ int parseLinkLabel() { - String m = match(LINK_LABEL); - // Spec says "A link label can have at most 999 characters inside the square brackets" - if (m == null || m.length() > 1001) { + if (index >= input.length() || input.charAt(index) != '[') { + return 0; + } + + int startContent = index + 1; + int endContent = LinkScanner.scanLinkLabelContent(input, startContent); + // spec: A link label can have at most 999 characters inside the square brackets. 
+ int contentLength = endContent - startContent; + if (endContent == -1 || contentLength > 999) { + return 0; + } + if (endContent >= input.length() || input.charAt(endContent) != ']') { return 0; - } else { - return m.length(); } + index = endContent + 1; + return contentLength + 2; } /** diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java index 9136b56f8..6a27f9419 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java @@ -109,9 +109,14 @@ public static String percentEncodeUrl(String s) { } public static String normalizeReference(String input) { - // Strip '[' and ']', then trim - String stripped = input.substring(1, input.length() - 1).trim(); - String lowercase = stripped.toLowerCase(Locale.ROOT); + // Strip '[' and ']' + String stripped = input.substring(1, input.length() - 1); + return normalizeLabelContent(stripped); + } + + public static String normalizeLabelContent(String input) { + String trimmed = input.trim(); + String lowercase = trimmed.toLowerCase(Locale.ROOT); return WHITESPACE.matcher(lowercase).replaceAll(" "); } diff --git a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java new file mode 100644 index 000000000..dae17d090 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java @@ -0,0 +1,148 @@ +package org.commonmark.internal.util; + +public class LinkScanner { + + /** + * Attempt to scan the contents of a link label (inside the brackets), returning the position after the content or + * -1. The returned position can either be the closing {@code ]}, or the end of the line if the label continues on + * the next line. 
+ */ + public static int scanLinkLabelContent(CharSequence input, int start) { + for (int i = start; i < input.length(); i++) { + char c = input.charAt(i); + switch (c) { + case '\\': + if (Parsing.isEscapable(input, i + 1)) { + i += 1; + } + break; + case ']': + return i; + case '[': + // spec: Unescaped square bracket characters are not allowed inside the opening and closing + // square brackets of link labels. + return -1; + } + } + return input.length(); + } + + /** + * Attempt to scan a link destination, returning the position after the destination or -1. + */ + public static int scanLinkDestination(CharSequence input, int start) { + if (start >= input.length()) { + return -1; + } + + if (input.charAt(start) == '<') { + for (int i = start + 1; i < input.length(); i++) { + char c = input.charAt(i); + switch (c) { + case '\\': + if (Parsing.isEscapable(input, i + 1)) { + i += 1; + } + break; + case '\n': + case '<': + return -1; + case '>': + return i + 1; + } + } + return -1; + } else { + return scanLinkDestinationWithBalancedParens(input, start); + } + } + + public static int scanLinkTitle(CharSequence input, int start) { + if (start >= input.length()) { + return -1; + } + + char endDelimiter; + switch (input.charAt(start)) { + case '"': + endDelimiter = '"'; + break; + case '\'': + endDelimiter = '\''; + break; + case '(': + endDelimiter = ')'; + break; + default: + return -1; + } + + int afterContent = scanLinkTitleContent(input, start, endDelimiter); + if (afterContent == -1) { + return -1; + } + + if (afterContent >= input.length() || input.charAt(afterContent) != endDelimiter) { + // missing or wrong end delimiter + return -1; + } + + return afterContent + 1; + } + + public static int scanLinkTitleContent(CharSequence input, int start, char endDelimiter) { + for (int i = start + 1; i < input.length(); i++) { + char c = input.charAt(i); + if (c == '\\' && Parsing.isEscapable(input, i + 1)) { + i += 1; + } else if (c == endDelimiter) { + return i; + } else 
if (endDelimiter == ')' && c == '(') { + // unescaped '(' in title within parens is invalid + return -1; + } + } + return input.length(); + } + + // spec: a nonempty sequence of characters that does not start with <, does not include ASCII space or control + // characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of a balanced + // pair of unescaped parentheses + private static int scanLinkDestinationWithBalancedParens(CharSequence input, int start) { + int parens = 0; + for (int i = start; i < input.length(); i++) { + char c = input.charAt(i); + switch (c) { + case '\0': + case ' ': + return i != start ? i : -1; + case '\\': + if (Parsing.isEscapable(input, i + 1)) { + i += 1; + } + break; + case '(': + parens++; + // Limit to 32 nested parens for pathological cases + if (parens > 32) { + return -1; + } + break; + case ')': + if (parens == 0) { + return i; + } else { + parens--; + } + break; + default: + // or control character + if (Character.isISOControl(c)) { + return i != start ? 
i : -1; + } + break; + } + } + return input.length(); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java index 94f77858d..d429d9db0 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java @@ -72,6 +72,47 @@ public static boolean isSpaceOrTab(CharSequence s, int index) { return false; } + public static boolean isEscapable(CharSequence s, int index) { + if (index < s.length()) { + switch (s.charAt(index)) { + case '!': + case '"': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '-': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '[': + case '\\': + case ']': + case '^': + case '_': + case '`': + case '{': + case '|': + case '}': + case '~': + return true; + } + } + return false; + } + /** * Prepares the input line replacing {@code \0} */ From 7fee1271d6e91804c7112d43cb075611231f8886 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 11:03:28 +1000 Subject: [PATCH 11/22] Make link reference definition parsing incremental MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old code would parse link reference definitions twice in the worst case. The new one parses it as part of paragraph parsing. 
Looks like this is faster too: -SpecBenchmark.parseExamples thrpt 50 485.743 ± 1.864 ops/s +SpecBenchmark.parseExamples thrpt 50 550.071 ± 8.638 ops/s -SpecBenchmark.parseWholeSpec thrpt 50 284.494 ± 2.641 ops/s +SpecBenchmark.parseWholeSpec thrpt 50 297.277 ± 3.272 ops/s --- .../commonmark/internal/DocumentParser.java | 54 +--- .../LinkReferenceDefinitionParser.java | 290 +++++++++++++----- .../OldLinkReferenceDefinitionParser.java | 124 ++++++++ .../commonmark/internal/ParagraphParser.java | 23 +- .../commonmark/internal/util/LinkScanner.java | 4 +- .../LinkReferenceDefinitionParserTest.java | 167 ++++++++++ 6 files changed, 534 insertions(+), 128 deletions(-) create mode 100644 commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java create mode 100644 commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java index b716286dc..886db2bf2 100644 --- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java @@ -64,7 +64,7 @@ public class DocumentParser implements ParserState { private final InlineParserFactory inlineParserFactory; private final List delimiterProcessors; private final DocumentBlockParser documentBlockParser; - private final LinkReferenceDefinitionParser linkReferenceDefinitionParser; + private final Map definitions = new LinkedHashMap<>(); private List activeBlockParsers = new ArrayList<>(); // LinkedHashSet to have a deterministic order @@ -77,7 +77,6 @@ public DocumentParser(List blockParserFactories, InlineParse this.delimiterProcessors = delimiterProcessors; this.documentBlockParser = new DocumentBlockParser(); - this.linkReferenceDefinitionParser = new LinkReferenceDefinitionParser(); activateBlockParser(this.documentBlockParser); } @@ -396,39 
+395,25 @@ private void finalize(BlockParser blockParser) { if (blockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) blockParser; - String content = paragraphParser.getContentString(); - - Block paragraph = paragraphParser.getBlock(); - // TODO: Insert resulting nodes into AST (before paragraph node) - int afterDefinitions = linkReferenceDefinitionParser.parseDefinitions(content); - - if (afterDefinitions != 0) { - String remainingContent = content.substring(afterDefinitions); - if (Parsing.isBlank(remainingContent)) { - // If all we had was reference definitions, remove the block that's now empty - paragraph.unlink(); - paragraphParser.setContentString(""); - } else { - // We had some content after the definitions, use that for the paragraph - paragraphParser.setContentString(remainingContent); - } - } + addDefinitionsFrom(paragraphParser); } } - /** - * Walk through a block & children recursively, parsing string content into inline content where appropriate. - */ - private void processInlines() { - Map definitions = new LinkedHashMap<>(); - for (LinkReferenceDefinition definition : linkReferenceDefinitionParser.getDefinitions()) { + private void addDefinitionsFrom(ParagraphParser paragraphParser) { + for (LinkReferenceDefinition definition : paragraphParser.getDefinitions()) { String label = definition.getLabel(); // spec: When there are multiple matching link reference definitions, the first is used if (!definitions.containsKey(label)) { definitions.put(label, definition); } } + } + + /** + * Walk through a block & children recursively, parsing string content into inline content where appropriate. 
+ */ + private void processInlines() { InlineParserContextImpl context = new InlineParserContextImpl(delimiterProcessors, definitions); InlineParser inlineParser = inlineParserFactory.create(context); @@ -467,13 +452,14 @@ private void prepareActiveBlockParserForReplacement() { allBlockParsers.remove(old); if (old instanceof ParagraphParser) { - String content = ((ParagraphParser) old).getContentString(); + ParagraphParser paragraphParser = (ParagraphParser) old; + // TODO: adjust comment? // Collect any link reference definitions. Note that replacing the active block parser is done after a // block parser got the current paragraph content using MatchedBlockParser#getContentString. In our // implementation of that, we strip link reference definitions from the paragraph content before we give it // to the block parser. We want to keep them. If no replacement happens, we collect the definitions as part // of finalizing paragraph blocks. - linkReferenceDefinitionParser.parseDefinitions(content); + addDefinitionsFrom(paragraphParser); } old.getBlock().unlink(); @@ -512,17 +498,9 @@ public BlockParser getMatchedBlockParser() { public CharSequence getParagraphContent() { if (matchedBlockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) matchedBlockParser; - String content = paragraphParser.getContentString(); - - // Strip link reference definitions, they are not going to be part of the paragraph text. 
- LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); - int afterDefinitions = parser.parseDefinitions(content); - if (afterDefinitions != 0) { - content = content.substring(afterDefinitions); - if (Parsing.isBlank(content)) { - // Paragraph consists only of link reference definitions -> no actual paragraph content - return null; - } + CharSequence content = paragraphParser.getContentString(); + if (content.length() == 0) { + return null; } return content; diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java index 200f67d3c..e4ddfa8d0 100644 --- a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java @@ -1,124 +1,254 @@ package org.commonmark.internal; import org.commonmark.internal.util.Escaping; +import org.commonmark.internal.util.LinkScanner; +import org.commonmark.internal.util.Parsing; import org.commonmark.node.LinkReferenceDefinition; -import org.commonmark.parser.delimiter.DelimiterProcessor; import java.util.ArrayList; -import java.util.Collections; import java.util.List; -import java.util.regex.Pattern; /** + * Parser for link reference definitions at the beginning of a paragraph. + * * @see Link reference definitions */ -class LinkReferenceDefinitionParser extends InlineParserImpl { +public class LinkReferenceDefinitionParser { - /** - * Parsed link reference definitions by label, in order of occurrence. 
- */ - private List definitions = new ArrayList<>(); + private State state = State.START_DEFINITION; - private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); + private final StringBuilder paragraph = new StringBuilder(); + private final List definitions = new ArrayList<>(); - LinkReferenceDefinitionParser() { - // Not needed for parsing link reference definitions - super(new InlineParserContextImpl(Collections.emptyList(), - Collections.emptyMap())); + private StringBuilder label; + private String normalizedLabel; + private String destination; + private char titleDelimiter; + private StringBuilder title; + private boolean referenceValid = false; + + public void parse(CharSequence line) { + if (paragraph.length() != 0) { + paragraph.append('\n'); + } + paragraph.append(line); + + int i = 0; + while (i < line.length()) { + switch (state) { + case PARAGRAPH: { + // We're in a paragraph now. Link reference definitions can only appear at the beginning, so once + // we're in a paragraph, there's no going back. + return; + } + case START_DEFINITION: { + i = startDefinition(line, i); + break; + } + case LABEL: { + i = label(line, i); + break; + } + case DESTINATION: { + i = destination(line, i); + break; + } + case START_TITLE: { + i = startTitle(line, i); + break; + } + case TITLE: { + i = title(line, i); + break; + } + } + // -1 is returned if parsing failed, which means we fall back to treating text as a paragraph. + if (i == -1) { + state = State.PARAGRAPH; + return; + } + } + } + + CharSequence getParagraphContent() { + return paragraph; } - // TODO: Would be better to just return them from the method. List getDefinitions() { + finishReference(); return definitions; } - /** - * Parse all link reference definitions, add them to the map and return the length of the text we parsed (if any). 
- */ - int parseDefinitions(String content) { - int afterAllDefinitions = 0; - while (content.length() > 3 && content.charAt(0) == '[') { - int afterDefinition = parseDefinition(content); - if (afterDefinition != 0) { - content = content.substring(afterDefinition); - afterAllDefinitions += afterDefinition; - } else { - break; - } + State getState() { + return state; + } + + private int startDefinition(CharSequence line, int i) { + i = Parsing.skipSpaceTab(line, i, line.length()); + if (i >= line.length() || line.charAt(i) != '[') { + return -1; } - return afterAllDefinitions; + + state = State.LABEL; + label = new StringBuilder(); + + return i + 1; } - /** - * Attempt to parse a single link reference definition, adding it to the map. - */ - private int parseDefinition(String content) { - reset(content); - - String dest; - String title = null; - int matchChars; - int startIndex = index; - - // label: - matchChars = parseLinkLabel(); - if (matchChars == 0) { - return 0; + private int label(CharSequence line, int i) { + int afterLabel = LinkScanner.scanLinkLabelContent(line, i); + if (afterLabel == -1) { + return -1; } - String rawLabel = input.substring(0, matchChars); + label.append(line, i, afterLabel); + + if (afterLabel >= line.length()) { + // label might continue on next line + label.append('\n'); + return afterLabel; + } else if (line.charAt(afterLabel) == ']') { + int colon = afterLabel + 1; + // end of label + if (colon >= line.length() || line.charAt(colon) != ':') { + return -1; + } + + int afterSpace = Parsing.skipSpaceTab(line, colon + 1, line.length()); + + String normalizedLabel = Escaping.normalizeLabelContent(label.toString()); + if (normalizedLabel.isEmpty()) { + return -1; + } + + this.normalizedLabel = normalizedLabel; + state = State.DESTINATION; - // colon: - if (peek() != ':') { - return 0; + return afterSpace; + } else { + return -1; } - index++; + } - // link url - spnl(); + private int destination(CharSequence line, int i) { + i = 
Parsing.skipSpaceTab(line, i, line.length()); + int afterDestination = LinkScanner.scanLinkDestination(line, i); + if (afterDestination == -1) { + return -1; + } - dest = parseLinkDestination(); - if (dest == null) { - return 0; + destination = (line.charAt(i) == '<') + ? line.subSequence(i + 1, afterDestination - 1).toString() + : line.subSequence(i, afterDestination).toString(); + + int afterSpace = Parsing.skipSpaceTab(line, afterDestination, line.length()); + if (afterSpace >= line.length()) { + // Destination was at end of line, so this is a valid reference for sure (and maybe a title). + // If not at end of line, wait for title to be valid first. + referenceValid = true; + paragraph.setLength(0); + } else if (afterSpace == afterDestination) { + // spec: The title must be separated from the link destination by whitespace + return -1; } - int beforeTitle = index; - spnl(); - if (index != beforeTitle) { - title = parseLinkTitle(); + state = State.START_TITLE; + return afterSpace; + } + + private int startTitle(CharSequence line, int i) { + i = Parsing.skipSpaceTab(line, i, line.length()); + if (i >= line.length()) { + state = State.START_DEFINITION; + return i; } - if (title == null) { - // rewind before spaces - index = beforeTitle; + + titleDelimiter = '\0'; + char c = line.charAt(i); + switch (c) { + case '"': + case '\'': + titleDelimiter = c; + break; + case '(': + titleDelimiter = ')'; + break; } - boolean atLineEnd = true; - if (index != input.length() && match(LINE_END) == null) { - if (title == null) { - atLineEnd = false; - } else { - // the potential title we found is not at the line end, - // but it could still be a legal link reference if we - // discard the title - title = null; - // rewind before spaces - index = beforeTitle; - // and instead check if the link URL is at the line end - atLineEnd = match(LINE_END) != null; + if (titleDelimiter != '\0') { + state = State.TITLE; + title = new StringBuilder(); + i++; + if (i == line.length()) { + 
title.append('\n'); } + } else { + finishReference(); + // There might be another reference instead, try that for the same character. + state = State.START_DEFINITION; } + return i; + } - if (!atLineEnd) { - return 0; + private int title(CharSequence line, int i) { + int afterTitle = LinkScanner.scanLinkTitleContent(line, i, titleDelimiter); + if (afterTitle == -1) { + // Invalid title, stop + return -1; } - String normalizedLabel = Escaping.normalizeReference(rawLabel); - if (normalizedLabel.isEmpty()) { - return 0; + title.append(line.subSequence(i, afterTitle)); + + if (afterTitle >= line.length()) { + // Title still going, continue on next line + title.append('\n'); + return afterTitle; } - definitions.add(new LinkReferenceDefinition(normalizedLabel, dest, title)); + int afterTitleDelimiter = afterTitle + 1; + int afterSpace = Parsing.skipSpaceTab(line, afterTitleDelimiter, line.length()); + if (afterSpace != line.length()) { + // spec: No further non-whitespace characters may occur on the line. + return -1; + } + referenceValid = true; + finishReference(); + paragraph.setLength(0); + + // See if there's another definition. + state = State.START_DEFINITION; + return afterSpace; + } + + private void finishReference() { + if (!referenceValid) { + return; + } + + String d = Escaping.unescapeString(destination); + String t = title != null ? Escaping.unescapeString(title.toString()) : null; + definitions.add(new LinkReferenceDefinition(normalizedLabel, d, t)); + + label = null; + referenceValid = false; + normalizedLabel = null; + destination = null; + title = null; + } - return index - startIndex; + enum State { + // Looking for the start of a definition, i.e. `[` + START_DEFINITION, + // Parsing the label, i.e. `foo` within `[foo]` + LABEL, + // Parsing the destination, i.e. `/url` in `[foo]: /url` + DESTINATION, + // Looking for the start of a title, i.e. the first `"` in `[foo]: /url "title"` + START_TITLE, + // Parsing the content of the title, i.e. 
`title` in `[foo]: /url "title"` + TITLE, + + // End state, no matter what kind of lines we add, they won't be references + PARAGRAPH, } } diff --git a/commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java new file mode 100644 index 000000000..fc01248f2 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java @@ -0,0 +1,124 @@ +package org.commonmark.internal; + +import org.commonmark.internal.util.Escaping; +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** + * @see Link reference definitions + */ +class OldLinkReferenceDefinitionParser extends InlineParserImpl { + + /** + * Parsed link reference definitions by label, in order of occurrence. + */ + private List definitions = new ArrayList<>(); + + private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); + + OldLinkReferenceDefinitionParser() { + // Not needed for parsing link reference definitions + super(new InlineParserContextImpl(Collections.emptyList(), + Collections.emptyMap())); + } + + // TODO: Would be better to just return them from the method. + List getDefinitions() { + return definitions; + } + + /** + * Parse all link reference definitions, add them to the map and return the length of the text we parsed (if any). 
+ */ + int parseDefinitions(String content) { + int afterAllDefinitions = 0; + while (content.length() > 3 && content.charAt(0) == '[') { + int afterDefinition = parseDefinition(content); + if (afterDefinition != 0) { + content = content.substring(afterDefinition); + afterAllDefinitions += afterDefinition; + } else { + break; + } + } + return afterAllDefinitions; + } + + /** + * Attempt to parse a single link reference definition, adding it to the map. + */ + private int parseDefinition(String content) { + reset(content); + + String dest; + String title = null; + int matchChars; + int startIndex = index; + + // label: + matchChars = parseLinkLabel(); + if (matchChars == 0) { + return 0; + } + + String rawLabel = input.substring(0, matchChars); + + // colon: + if (peek() != ':') { + return 0; + } + index++; + + // link url + spnl(); + + dest = parseLinkDestination(); + if (dest == null) { + return 0; + } + + int beforeTitle = index; + spnl(); + if (index != beforeTitle) { + title = parseLinkTitle(); + } + if (title == null) { + // rewind before spaces + index = beforeTitle; + } + + boolean atLineEnd = true; + if (index != input.length() && match(LINE_END) == null) { + if (title == null) { + atLineEnd = false; + } else { + // the potential title we found is not at the line end, + // but it could still be a legal link reference if we + // discard the title + title = null; + // rewind before spaces + index = beforeTitle; + // and instead check if the link URL is at the line end + atLineEnd = match(LINE_END) != null; + } + } + + if (!atLineEnd) { + return 0; + } + + String normalizedLabel = Escaping.normalizeReference(rawLabel); + if (normalizedLabel.isEmpty()) { + return 0; + } + + definitions.add(new LinkReferenceDefinition(normalizedLabel, dest, title)); + + return index - startIndex; + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java index 
f6556ab2e..fc44cfd57 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java @@ -1,16 +1,19 @@ package org.commonmark.internal; import org.commonmark.node.Block; +import org.commonmark.node.LinkReferenceDefinition; import org.commonmark.node.Paragraph; import org.commonmark.parser.InlineParser; import org.commonmark.parser.block.AbstractBlockParser; import org.commonmark.parser.block.BlockContinue; import org.commonmark.parser.block.ParserState; +import java.util.List; + public class ParagraphParser extends AbstractBlockParser { private final Paragraph block = new Paragraph(); - private BlockContent content = new BlockContent(); + private LinkReferenceDefinitionParser linkReferenceDefinitionParser = new LinkReferenceDefinitionParser(); @Override public Block getBlock() { @@ -28,25 +31,29 @@ public BlockContinue tryContinue(ParserState state) { @Override public void addLine(CharSequence line) { - content.add(line); + linkReferenceDefinitionParser.parse(line); } @Override public void closeBlock() { + if (linkReferenceDefinitionParser.getParagraphContent().length() == 0) { + block.unlink(); + } } @Override public void parseInlines(InlineParser inlineParser) { - if (content != null) { - inlineParser.parse(content.getString(), block); + CharSequence content = linkReferenceDefinitionParser.getParagraphContent(); + if (content.length() > 0) { + inlineParser.parse(content.toString(), block); } } - public String getContentString() { - return content.getString(); + public CharSequence getContentString() { + return linkReferenceDefinitionParser.getParagraphContent(); } - void setContentString(String contentString) { - content = new BlockContent(contentString); + public List getDefinitions() { + return linkReferenceDefinitionParser.getDefinitions(); } } diff --git a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java 
b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java index dae17d090..f25cd59e5 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java @@ -77,7 +77,7 @@ public static int scanLinkTitle(CharSequence input, int start) { return -1; } - int afterContent = scanLinkTitleContent(input, start, endDelimiter); + int afterContent = scanLinkTitleContent(input, start + 1, endDelimiter); if (afterContent == -1) { return -1; } @@ -91,7 +91,7 @@ public static int scanLinkTitle(CharSequence input, int start) { } public static int scanLinkTitleContent(CharSequence input, int start, char endDelimiter) { - for (int i = start + 1; i < input.length(); i++) { + for (int i = start; i < input.length(); i++) { char c = input.charAt(i); if (c == '\\' && Parsing.isEscapable(input, i + 1)) { i += 1; diff --git a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java new file mode 100644 index 000000000..75696ba28 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java @@ -0,0 +1,167 @@ +package org.commonmark.internal; + +import org.commonmark.internal.LinkReferenceDefinitionParser.State; +import org.commonmark.node.LinkReferenceDefinition; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class LinkReferenceDefinitionParserTest { + + private LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + + @Test + public void testStartLabel() { + parser.parse("["); + assertEquals(State.LABEL, parser.getState()); + assertEquals("[", parser.getParagraphContent().toString()); + } + + @Test + public void testStartNoLabel() { + // Not a label + assertParagraph("a"); + // Can not go back to parsing link reference definitions + parser.parse("a"); + 
parser.parse("["); + assertEquals(State.PARAGRAPH, parser.getState()); + assertEquals("a\n[", parser.getParagraphContent().toString()); + } + + @Test + public void testEmptyLabel() { + assertParagraph("[]: /"); + assertParagraph("[ ]: /"); + assertParagraph("[ \t\n\u000B\f\r ]: /"); + } + + @Test + public void testLabelColon() { + assertParagraph("[foo] : /"); + } + + @Test + public void testLabel() { + assertState("[foo]:", State.DESTINATION, "[foo]:"); + assertState("[ foo ]:", State.DESTINATION, "[ foo ]:"); + } + + @Test + public void testLabelInvalid() { + assertParagraph("[foo[]:"); + } + + @Test + public void testLabelMultiline() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[two"); + assertEquals(State.LABEL, parser.getState()); + parser.parse("lines]:"); + assertEquals(State.DESTINATION, parser.getState()); + parser.parse("/url"); + assertEquals(State.START_TITLE, parser.getState()); + assertDef(parser.getDefinitions().get(0), "two lines", "/url", null); + } + + @Test + public void testDestination() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url"); + assertEquals(State.START_TITLE, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + + parser.parse("[bar]: "); + assertDef(parser.getDefinitions().get(1), "bar", "/url2", null); + } + + @Test + public void testDestinationInvalid() { + assertParagraph("[foo]: "); + } + + @Test + public void testTitle() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url 'title'"); + assertEquals(State.START_DEFINITION, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", 
"title"); + } + + @Test + public void testTitleStartWhitespace() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url"); + assertEquals(State.START_TITLE, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + parser.parse(" "); + + assertEquals(State.START_DEFINITION, parser.getState()); + assertEquals(" ", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + } + + @Test + public void testTitleMultiline() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url 'two"); + assertEquals(State.TITLE, parser.getState()); + assertEquals("[foo]: /url 'two", parser.getParagraphContent().toString()); + assertEquals(0, parser.getDefinitions().size()); + + parser.parse("lines"); + assertEquals(State.TITLE, parser.getState()); + assertEquals("[foo]: /url 'two\nlines", parser.getParagraphContent().toString()); + assertEquals(0, parser.getDefinitions().size()); + + parser.parse("'"); + assertEquals(State.START_DEFINITION, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", "two\nlines\n"); + } + + @Test + public void testTitleMultiline2() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url '"); + assertEquals(State.TITLE, parser.getState()); + parser.parse("title'"); + assertEquals(State.START_DEFINITION, parser.getState()); + + assertDef(parser.getDefinitions().get(0), "foo", "/url", "\ntitle"); + } + + @Test + public void testTitleInvalid() { + assertParagraph("[foo]: /url (invalid("); + assertParagraph("[foo]: 'title'"); + assertParagraph("[foo]: /url 'title' INVALID"); + } + + private static void assertParagraph(String input) { + 
assertState(input, State.PARAGRAPH, input); + } + + private static void assertState(String input, State state, String paragraphContent) { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse(input); + assertEquals(state, parser.getState()); + assertEquals(paragraphContent, parser.getParagraphContent().toString()); + } + + private static void assertDef(LinkReferenceDefinition def, String label, String destination, String title) { + assertEquals(label, def.getLabel()); + assertEquals(destination, def.getDestination()); + assertEquals(title, def.getTitle()); + } +} From 320d57069da5d94a46f180e077a48496cbf130d6 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 14:26:20 +1000 Subject: [PATCH 12/22] Cleanup after extracting link reference definition parsing --- .../commonmark/internal/InlineParserImpl.java | 16 +-- .../OldLinkReferenceDefinitionParser.java | 124 ------------------ 2 files changed, 8 insertions(+), 132 deletions(-) delete mode 100644 commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index 00da9134d..7d6612c07 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -16,7 +16,7 @@ import java.util.regex.Pattern; public class InlineParserImpl implements InlineParser { - + private static final String HTMLCOMMENT = "|"; private static final String PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"; private static final String DECLARATION = "]*>"; @@ -58,8 +58,8 @@ public class InlineParserImpl implements InlineParser { private final Map delimiterProcessors; private final InlineParserContext context; - String input; - int index; + private String input; + private int index; /** * Top delimiter (emphasis, strong 
emphasis or custom emphasis). (Brackets are on a separate stack, different @@ -244,7 +244,7 @@ private Node parseInline(Node previous) { /** * If RE matches at current index in the input, advance index and return the match; otherwise return null. */ - String match(Pattern re) { + private String match(Pattern re) { if (index >= input.length()) { return null; } @@ -262,7 +262,7 @@ String match(Pattern re) { /** * Returns the char at the current input index, or {@code '\0'} in case there are no more characters. */ - char peek() { + private char peek() { if (index < input.length()) { return input.charAt(index); } else { @@ -273,7 +273,7 @@ char peek() { /** * Parse zero or more space characters, including at most one newline. */ - void spnl() { + private void spnl() { match(SPNL); } @@ -546,7 +546,7 @@ private void removeLastBracket() { /** * Attempt to parse link destination, returning the string or null if no match. */ - String parseLinkDestination() { + private String parseLinkDestination() { int afterDest = LinkScanner.scanLinkDestination(input, index); if (afterDest == -1) { return null; @@ -567,7 +567,7 @@ String parseLinkDestination() { /** * Attempt to parse link title (sans quotes), returning the string or null if no match. 
*/ - String parseLinkTitle() { + private String parseLinkTitle() { int afterTitle = LinkScanner.scanLinkTitle(input, index); if (afterTitle == -1) { return null; diff --git a/commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java deleted file mode 100644 index fc01248f2..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/OldLinkReferenceDefinitionParser.java +++ /dev/null @@ -1,124 +0,0 @@ -package org.commonmark.internal; - -import org.commonmark.internal.util.Escaping; -import org.commonmark.node.LinkReferenceDefinition; -import org.commonmark.parser.delimiter.DelimiterProcessor; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.regex.Pattern; - -/** - * @see Link reference definitions - */ -class OldLinkReferenceDefinitionParser extends InlineParserImpl { - - /** - * Parsed link reference definitions by label, in order of occurrence. - */ - private List definitions = new ArrayList<>(); - - private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); - - OldLinkReferenceDefinitionParser() { - // Not needed for parsing link reference definitions - super(new InlineParserContextImpl(Collections.emptyList(), - Collections.emptyMap())); - } - - // TODO: Would be better to just return them from the method. - List getDefinitions() { - return definitions; - } - - /** - * Parse all link reference definitions, add them to the map and return the length of the text we parsed (if any). 
- */ - int parseDefinitions(String content) { - int afterAllDefinitions = 0; - while (content.length() > 3 && content.charAt(0) == '[') { - int afterDefinition = parseDefinition(content); - if (afterDefinition != 0) { - content = content.substring(afterDefinition); - afterAllDefinitions += afterDefinition; - } else { - break; - } - } - return afterAllDefinitions; - } - - /** - * Attempt to parse a single link reference definition, adding it to the map. - */ - private int parseDefinition(String content) { - reset(content); - - String dest; - String title = null; - int matchChars; - int startIndex = index; - - // label: - matchChars = parseLinkLabel(); - if (matchChars == 0) { - return 0; - } - - String rawLabel = input.substring(0, matchChars); - - // colon: - if (peek() != ':') { - return 0; - } - index++; - - // link url - spnl(); - - dest = parseLinkDestination(); - if (dest == null) { - return 0; - } - - int beforeTitle = index; - spnl(); - if (index != beforeTitle) { - title = parseLinkTitle(); - } - if (title == null) { - // rewind before spaces - index = beforeTitle; - } - - boolean atLineEnd = true; - if (index != input.length() && match(LINE_END) == null) { - if (title == null) { - atLineEnd = false; - } else { - // the potential title we found is not at the line end, - // but it could still be a legal link reference if we - // discard the title - title = null; - // rewind before spaces - index = beforeTitle; - // and instead check if the link URL is at the line end - atLineEnd = match(LINE_END) != null; - } - } - - if (!atLineEnd) { - return 0; - } - - String normalizedLabel = Escaping.normalizeReference(rawLabel); - if (normalizedLabel.isEmpty()) { - return 0; - } - - definitions.add(new LinkReferenceDefinition(normalizedLabel, dest, title)); - - return index - startIndex; - } -} From bdbc9d3b563ec90fb580d7a94677206047ab6da7 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 14:49:14 +1000 Subject: [PATCH 13/22] Fix edge cases around link 
parsing --- .../internal/LinkReferenceDefinitionParser.java | 14 +++++++++++--- .../LinkReferenceDefinitionParserTest.java | 12 ++++++++++++ .../java/org/commonmark/test/SpecialInputTest.java | 5 +++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java index e4ddfa8d0..1fe2cbea7 100644 --- a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java @@ -92,7 +92,12 @@ private int startDefinition(CharSequence line, int i) { state = State.LABEL; label = new StringBuilder(); - return i + 1; + int labelStart = i + 1; + if (labelStart >= line.length()) { + label.append('\n'); + } + + return labelStart; } private int label(CharSequence line, int i) { @@ -114,7 +119,10 @@ private int label(CharSequence line, int i) { return -1; } - int afterSpace = Parsing.skipSpaceTab(line, colon + 1, line.length()); + // spec: A link label can have at most 999 characters inside the square brackets. 
+ if (label.length() > 999) { + return -1; + } String normalizedLabel = Escaping.normalizeLabelContent(label.toString()); if (normalizedLabel.isEmpty()) { @@ -124,7 +132,7 @@ private int label(CharSequence line, int i) { this.normalizedLabel = normalizedLabel; state = State.DESTINATION; - return afterSpace; + return Parsing.skipSpaceTab(line, colon + 1, line.length()); } else { return -1; } diff --git a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java index 75696ba28..f0bdef492 100644 --- a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java +++ b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java @@ -63,6 +63,18 @@ public void testLabelMultiline() { assertDef(parser.getDefinitions().get(0), "two lines", "/url", null); } + @Test + public void testLabelStartsWithNewline() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("["); + assertEquals(State.LABEL, parser.getState()); + parser.parse("weird]:"); + assertEquals(State.DESTINATION, parser.getState()); + parser.parse("/url"); + assertEquals(State.START_TITLE, parser.getState()); + assertDef(parser.getDefinitions().get(0), "weird", "/url", null); + } + @Test public void testDestination() { LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); diff --git a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java index 1550d3197..55ba71e2f 100644 --- a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java +++ b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java @@ -114,12 +114,13 @@ public void linkDestinationEscaping() { assertRendering("[foo](\\))", "

foo

\n"); // ` ` is not escapable, so the backslash is a literal backslash and there's an optional space at the end assertRendering("[foo](\\ )", "

foo

\n"); - // Backslash escapes `>`, so it's not a `(<...>)` link, but a `(...)` link instead - assertRendering("[foo](<\\>)", "

foo

\n"); // Backslash is a literal, so valid assertRendering("[foo]()", "

foo

\n"); // Backslash escapes `>` but there's another `>`, valid assertRendering("[foo](>)", "

foo

\n"); + + // This is a tricky one. There's `<` so we try to parse it as a `<` link but fail. + assertRendering("[foo](<\\>)", "

[foo](<>)

\n"); } // commonmark/CommonMark#468 From 48614f381bc532f185c9815bcbff6f248949186e Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 14:50:07 +1000 Subject: [PATCH 14/22] Disallow lists indented more than 3 spaces (spec 0.29) --- .../commonmark/internal/ListBlockParser.java | 2 +- .../org/commonmark/test/SpecialInputTest.java | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java index 28f9bfb0f..de1558f92 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java @@ -205,7 +205,7 @@ public static class Factory extends AbstractBlockParserFactory { public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { BlockParser matched = matchedBlockParser.getMatchedBlockParser(); - if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT && !(matched instanceof ListBlockParser)) { + if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT) { return BlockStart.none(); } int markerIndex = state.getNextNonSpaceIndex(); diff --git a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java index 55ba71e2f..c2bb9fd4c 100644 --- a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java +++ b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java @@ -139,4 +139,27 @@ public void linkReferenceBackslash() { public void emphasisMultipleOf3Rule() { assertRendering("a***b* c*", "

a*b c

\n"); } + + @Test + public void deeplyIndentedList() { + assertRendering("* one\n" + + " * two\n" + + " * three\n" + + " * four", + "
    \n" + + "
  • one\n" + + "
      \n" + + "
    • two\n" + + "
        \n" + + "
      • three\n" + + "
          \n" + + "
        • four
        • \n" + + "
        \n" + + "
      • \n" + + "
      \n" + + "
    • \n" + + "
    \n" + + "
  • \n" + + "
\n"); + } } From e760f758733e4c1e01fef0b6fa541ac46391054a Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 14:56:20 +1000 Subject: [PATCH 15/22] Adjust delimiter test to not use code block marker --- .../test/DelimiterProcessorTest.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java b/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java index e7348f5dd..948c484cd 100644 --- a/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java +++ b/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java @@ -52,18 +52,18 @@ public void asymmetricDelimiter() { @Test public void multipleDelimitersWithDifferentLengths() { Parser parser = Parser.builder() - .customDelimiterProcessor(new OneTildeDelimiterProcessor()) - .customDelimiterProcessor(new TwoTildesDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) + .customDelimiterProcessor(new TwoDelimiterProcessor()) .build(); - assertEquals("

(1)one(/1) (2)two(/2)

\n", RENDERER.render(parser.parse("~one~ ~~two~~"))); - assertEquals("

(1)(2)both(/2)(/1)

\n", RENDERER.render(parser.parse("~~~both~~~"))); + assertEquals("

(1)one(/1) (2)two(/2)

\n", RENDERER.render(parser.parse("+one+ ++two++"))); + assertEquals("

(1)(2)both(/2)(/1)

\n", RENDERER.render(parser.parse("+++both+++"))); } @Test(expected = IllegalArgumentException.class) public void multipleDelimitersWithSameLength() { Parser.builder() - .customDelimiterProcessor(new OneTildeDelimiterProcessor()) - .customDelimiterProcessor(new OneTildeDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) .build(); } @@ -180,16 +180,16 @@ public void render(Node node) { } } - private static class OneTildeDelimiterProcessor implements DelimiterProcessor { + private static class OneDelimiterProcessor implements DelimiterProcessor { @Override public char getOpeningCharacter() { - return '~'; + return '+'; } @Override public char getClosingCharacter() { - return '~'; + return '+'; } @Override @@ -209,16 +209,16 @@ public void process(Text opener, Text closer, int delimiterUse) { } } - private static class TwoTildesDelimiterProcessor implements DelimiterProcessor { + private static class TwoDelimiterProcessor implements DelimiterProcessor { @Override public char getOpeningCharacter() { - return '~'; + return '+'; } @Override public char getClosingCharacter() { - return '~'; + return '+'; } @Override From 08b774819680fbc0b862e770567ac05c07209302 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 14:58:52 +1000 Subject: [PATCH 16/22] Adjust tests to spec changes --- .../java/org/commonmark/test/FencedCodeBlockParserTest.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java b/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java index 1a64a6374..774c6ff0e 100644 --- a/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java +++ b/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java @@ -26,14 +26,12 @@ public void backtickInfo() { public void backtickInfoDoesntAllowBacktick() { assertRendering("```info 
` test\ncode\n```", "

```info ` test\ncode

\n
\n"); - // Note, it's unclear in the spec whether a ~~~ code block can contain ` in info or not, see: - // https://github.com/commonmark/CommonMark/issues/119 } @Test public void backtickAndTildeCantBeMixed() { assertRendering("``~`\ncode\n``~`", - "

~` code~`

\n"); + "

~` code ~`

\n"); } @Test From 490af42e6f7347be71acda12c6946020f9c8ddde Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 17:12:51 +1000 Subject: [PATCH 17/22] Update regression tests from commonmark.js --- .../src/main/resources/commonmark.js-regression.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt index a99620bb1..ec5143eff 100644 --- a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt +++ b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt @@ -147,3 +147,15 @@ Issue #289. .

[a](<b) c>

```````````````````````````````` + +Issue #161. + +```````````````````````````````` example +*failed to be italic!*\ +text +. +

failed to be italic!
+text

+```````````````````````````````` + + From ab7c1c7051b2f6a6521635b96830fe776ca89906 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 17:19:53 +1000 Subject: [PATCH 18/22] No longer treat `<meta>` as a block tag (spec 0.29) --- .../src/main/java/org/commonmark/internal/HtmlBlockParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java index c80822d3c..3b3a0e64f 100644 --- a/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java @@ -42,7 +42,7 @@ public class HtmlBlockParser extends AbstractBlockParser { "h1|h2|h3|h4|h5|h6|head|header|hr|html|" + "iframe|" + "legend|li|link|" + - "main|menu|menuitem|meta|" + + "main|menu|menuitem|" + "nav|noframes|" + "ol|optgroup|option|" + "p|param|" + From c5f5a348acd1c67b846355607f0c7d72c1d39558 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Thu, 11 Jul 2019 17:22:18 +1000 Subject: [PATCH 19/22] Don't preserve entities when rendering HTML attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/commonmark/commonmark.js/commit/c89b35c5fc99bdf1d2181f7f0c9fcb8a1abc27c8 Also replaced the regex for escaping with a loop, which speeds up HTML rendering: -SpecBenchmark.parseAndRenderExamples thrpt 50 344.820 ± 1.215 ops/s +SpecBenchmark.parseAndRenderExamples thrpt 50 374.342 ± 2.445 ops/s -SpecBenchmark.parseAndRenderWholeSpec thrpt 50 151.209 ± 1.148 ops/s +SpecBenchmark.parseAndRenderWholeSpec thrpt 50 198.357 ± 2.601 ops/s (Note that these benchmarks include parsing, so rendering itself saw a very nice improvement.) 
--- .../commonmark/internal/util/Escaping.java | 67 ++++++++++--------- .../commonmark/renderer/html/HtmlWriter.java | 6 +- .../internal/util/EscapingTest.java | 21 ++++++ .../org/commonmark/test/HtmlRendererTest.java | 9 +++ .../org/commonmark/test/RegressionTest.java | 20 +++++- 5 files changed, 87 insertions(+), 36 deletions(-) create mode 100644 commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java index 6a27f9419..dbcdc4a5a 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java @@ -16,13 +16,6 @@ public class Escaping { private static final Pattern ENTITY_OR_ESCAPED_CHAR = Pattern.compile("\\\\" + ESCAPABLE + '|' + ENTITY, Pattern.CASE_INSENSITIVE); - private static final String XML_SPECIAL = "[&<>\"]"; - - private static final Pattern XML_SPECIAL_RE = Pattern.compile(XML_SPECIAL); - - private static final Pattern XML_SPECIAL_OR_ENTITY = - Pattern.compile(ENTITY + '|' + XML_SPECIAL, Pattern.CASE_INSENSITIVE); - // From RFC 3986 (see "reserved", "unreserved") except don't escape '[' or ']' to be compatible with JS encodeURI private static final Pattern ESCAPE_IN_URI = Pattern.compile("(%[a-fA-F0-9]{0,2}|[^:/?#@!$&'()*+,;=a-zA-Z0-9\\-._~])"); @@ -32,28 +25,6 @@ public class Escaping { private static final Pattern WHITESPACE = Pattern.compile("[ \t\r\n]+"); - private static final Replacer UNSAFE_CHAR_REPLACER = new Replacer() { - @Override - public void replace(String input, StringBuilder sb) { - switch (input) { - case "&": - sb.append("&"); - break; - case "<": - sb.append("<"); - break; - case ">": - sb.append(">"); - break; - case "\"": - sb.append("""); - break; - default: - sb.append(input); - } - } - }; - private static final Replacer UNESCAPE_REPLACER = new Replacer() { @Override public void 
replace(String input, StringBuilder sb) { @@ -88,9 +59,41 @@ public void replace(String input, StringBuilder sb) { } }; - public static String escapeHtml(String input, boolean preserveEntities) { - Pattern p = preserveEntities ? XML_SPECIAL_OR_ENTITY : XML_SPECIAL_RE; - return replaceAll(p, input, UNSAFE_CHAR_REPLACER); + public static String escapeHtml(String input) { + // Avoid building a new string in the majority of cases (nothing to escape) + StringBuilder sb = null; + + loop: + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + String replacement; + switch (c) { + case '&': + replacement = "&"; + break; + case '<': + replacement = "<"; + break; + case '>': + replacement = ">"; + break; + case '\"': + replacement = """; + break; + default: + if (sb != null) { + sb.append(c); + } + continue loop; + } + if (sb == null) { + sb = new StringBuilder(); + sb.append(input, 0, i); + } + sb.append(replacement); + } + + return sb != null ? sb.toString() : input; } /** diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java index ec38e8f39..8c79eb8b4 100644 --- a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java +++ b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java @@ -25,7 +25,7 @@ public void raw(String s) { } public void text(String text) { - append(Escaping.escapeHtml(text, false)); + append(Escaping.escapeHtml(text)); } public void tag(String name) { @@ -42,9 +42,9 @@ public void tag(String name, Map attrs, boolean voidElement) { if (attrs != null && !attrs.isEmpty()) { for (Map.Entry attrib : attrs.entrySet()) { append(" "); - append(Escaping.escapeHtml(attrib.getKey(), true)); + append(Escaping.escapeHtml(attrib.getKey())); append("=\""); - append(Escaping.escapeHtml(attrib.getValue(), true)); + append(Escaping.escapeHtml(attrib.getValue())); append("\""); } } diff --git 
a/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java b/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java new file mode 100644 index 000000000..9433eb7d0 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java @@ -0,0 +1,21 @@ +package org.commonmark.internal.util; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class EscapingTest { + + @Test + public void testEscapeHtml() { + assertEquals("nothing to escape", Escaping.escapeHtml("nothing to escape")); + assertEquals("&", Escaping.escapeHtml("&")); + assertEquals("<", Escaping.escapeHtml("<")); + assertEquals(">", Escaping.escapeHtml(">")); + assertEquals(""", Escaping.escapeHtml("\"")); + assertEquals("< start", Escaping.escapeHtml("< start")); + assertEquals("end >", Escaping.escapeHtml("end >")); + assertEquals("< both >", Escaping.escapeHtml("< both >")); + assertEquals("< middle & too >", Escaping.escapeHtml("< middle & too >")); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java b/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java index 6ccfe5465..30cbf24f3 100644 --- a/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java +++ b/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java @@ -50,6 +50,15 @@ public void textEscaping() { assertEquals("

escaping: & < > " '

\n", rendered); } + @Test + public void attributeEscaping() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination(":"); + paragraph.appendChild(link); + assertEquals("

\n", defaultRenderer().render(paragraph)); + } + @Test public void percentEncodeUrlDisabled() { assertEquals("

a

\n", defaultRenderer().render(parse("[a](foo&bar)"))); diff --git a/commonmark/src/test/java/org/commonmark/test/RegressionTest.java b/commonmark/src/test/java/org/commonmark/test/RegressionTest.java index 5d49c2abd..c4a0d3be5 100644 --- a/commonmark/src/test/java/org/commonmark/test/RegressionTest.java +++ b/commonmark/src/test/java/org/commonmark/test/RegressionTest.java @@ -13,7 +13,9 @@ import java.net.URL; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; @RunWith(Parameterized.class) public class RegressionTest extends RenderingTestCase { @@ -22,6 +24,8 @@ public class RegressionTest extends RenderingTestCase { // The spec says URL-escaping is optional, but the examples assume that it's enabled. private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + private static final Map OVERRIDDEN_EXAMPLES = getOverriddenExamples(); + private final Example example; public RegressionTest(Example example) { @@ -42,11 +46,25 @@ public static List data() { @Test public void testHtmlRendering() { - assertRendering(example.getSource(), example.getHtml()); + String expectedHtml = OVERRIDDEN_EXAMPLES.get(example.getSource()); + if (expectedHtml == null) { + expectedHtml = example.getHtml(); + } + assertRendering(example.getSource(), expectedHtml); } @Override protected String render(String source) { return RENDERER.render(PARSER.parse(source)); } + + private static Map getOverriddenExamples() { + Map m = new HashMap<>(); + + // The only difference is that we don't change `%28` and `%29` to `(` and `)` (percent encoding is preserved) + m.put("[XSS](javascript&colon;alert%28'XSS'%29)\n", + "

XSS

\n"); + + return m; + } } From c8ccf85a7a0b57fa42e3ce02c44214aaefd3f1d7 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Fri, 12 Jul 2019 14:20:42 +1000 Subject: [PATCH 20/22] Fix strikethrough test by avoiding confusion with code block --- .../org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java index 85ae72eb2..225977854 100644 --- a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java +++ b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java @@ -44,7 +44,7 @@ public void unmatched() { @Test public void threeInnerThree() { - assertRendering("~~~foo~~~", "

~foo~

\n"); + assertRendering("a ~~~foo~~~", "

a ~foo~

\n"); } @Test From b1d8bb49cb1852822ab3e2e54473daa3887baf6a Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Fri, 12 Jul 2019 14:24:57 +1000 Subject: [PATCH 21/22] Adjust max length for decimal/numeric entities See commonmark/commonmark-spec#487 --- .../main/java/org/commonmark/internal/InlineParserImpl.java | 3 +-- .../src/main/java/org/commonmark/internal/util/Escaping.java | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index 7d6612c07..38972cada 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -23,7 +23,6 @@ public class InlineParserImpl implements InlineParser { private static final String CDATA = ""; private static final String HTMLTAG = "(?:" + Parsing.OPENTAG + "|" + Parsing.CLOSETAG + "|" + HTMLCOMMENT + "|" + PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"; - private static final String ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; private static final String ASCII_PUNCTUATION = "!\"#\\$%&'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~"; private static final Pattern PUNCTUATION = Pattern @@ -33,7 +32,7 @@ public class InlineParserImpl implements InlineParser { private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE); - private static final Pattern ENTITY_HERE = Pattern.compile('^' + ENTITY, Pattern.CASE_INSENSITIVE); + private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE); private static final Pattern TICKS = Pattern.compile("`+"); diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java index dbcdc4a5a..15197556c 100644 --- 
a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java @@ -9,7 +9,7 @@ public class Escaping { public static final String ESCAPABLE = "[!\"#$%&\'()*+,./:;<=>?@\\[\\\\\\]^_`{|}~-]"; - private static final String ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; + public static final String ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"; private static final Pattern BACKSLASH_OR_AMP = Pattern.compile("[\\\\&]"); From 3011dcb7e1cbff761994e803696e01aa37c826d8 Mon Sep 17 00:00:00 2001 From: Robin Stocker Date: Fri, 12 Jul 2019 14:57:10 +1000 Subject: [PATCH 22/22] Adjust comment and remove TODO --- .../java/org/commonmark/internal/DocumentParser.java | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java index 886db2bf2..31716da88 100644 --- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java @@ -453,12 +453,11 @@ private void prepareActiveBlockParserForReplacement() { if (old instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) old; - // TODO: adjust comment? // Collect any link reference definitions. Note that replacing the active block parser is done after a - // block parser got the current paragraph content using MatchedBlockParser#getContentString. In our - // implementation of that, we strip link reference definitions from the paragraph content before we give it - // to the block parser. We want to keep them. If no replacement happens, we collect the definitions as part - // of finalizing paragraph blocks. + // block parser got the current paragraph content using MatchedBlockParser#getContentString. 
In case the + // paragraph started with link reference definitions, we parse and strip them before the block parser gets + // the content. We want to keep them. + // If no replacement happens, we collect the definitions as part of finalizing paragraph blocks. addDefinitionsFrom(paragraphParser); }