diff --git a/CHANGELOG.md b/CHANGELOG.md index cb5f8c178..17e470514 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,12 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html with the exception that 0.x versions can break between minor versions. ## Unreleased +### Added +- `InlineParserContext.getLinkReferenceDefinition` was added to allow + custom inline parsers to look up definitions for reference links. ### Changed +- Link reference definition parsing has been changed according to the + spec: Definitions can now be in setext headings too. - Check non-null arguments early and provide a nicer message ## [0.12.1] - 2018-11-13 diff --git a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java index 85ae72eb2..225977854 100644 --- a/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java +++ b/commonmark-ext-gfm-strikethrough/src/test/java/org/commonmark/ext/gfm/strikethrough/StrikethroughTest.java @@ -44,7 +44,7 @@ public void unmatched() { @Test public void threeInnerThree() { - assertRendering("~~~foo~~~", "

~foo~

\n"); + assertRendering("a ~~~foo~~~", "

a ~foo~

\n"); } @Test diff --git a/commonmark-test-util/src/main/resources/cmark-regression.txt b/commonmark-test-util/src/main/resources/cmark-regression.txt index 2984a3bef..62b1e7efe 100644 --- a/commonmark-test-util/src/main/resources/cmark-regression.txt +++ b/commonmark-test-util/src/main/resources/cmark-regression.txt @@ -4,8 +4,7 @@ Issue #113: EOL character weirdness on Windows (Important: first line ends with CR + CR + LF) ```````````````````````````````` example -line1 - +line1 line2 .

line1

@@ -82,7 +81,8 @@ Issue #193 - unescaped left angle brackets in link destination [a]: . -

a

+

[a]

+

[a]: <te

```````````````````````````````` Issue #192 - escaped spaces in link destination @@ -93,3 +93,64 @@ Issue #192 - escaped spaces in link destination .

[a](te\ st)

```````````````````````````````` + +Issue #527 - meta tags in inline contexts + +```````````````````````````````` example +City: + + + +. +

City: + + +

+```````````````````````````````` + +Issue #530 - link parsing corner cases + +```````````````````````````````` example +[a](\ b) + +[a](<[a](\ b)

+

[a](<<b)

+

[a](<b +)

+```````````````````````````````` + +Issue commonmark#526 - unescaped ( in link title + +```````````````````````````````` example +[link](url ((title)) +. +

[link](url ((title))

+```````````````````````````````` + +Issue commonmark#517 - script, pre, style close tag without +opener. + +```````````````````````````````` example +</script> + +</pre> + +</style> +. +</script> +</pre> +</style> +```````````````````````````````` + +Issue #289. + +```````````````````````````````` example +[a](<b) c> +. +

[a](<b) c>

+```````````````````````````````` diff --git a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt index 7300952fe..ec5143eff 100644 --- a/commonmark-test-util/src/main/resources/commonmark.js-regression.txt +++ b/commonmark-test-util/src/main/resources/commonmark.js-regression.txt @@ -15,10 +15,10 @@ bar Type 7 HTML block followed by whitespace (#98). ```````````````````````````````` example - + x . - + x ```````````````````````````````` @@ -95,10 +95,67 @@ Issue #116 - tabs before and after ATX closing heading

foo

```````````````````````````````` -commonmark/CommonMark#493 - escaped space not allowed in link -destination. +commonmark/CommonMark#493 - escaped space not allowed in link destination. + ```````````````````````````````` example [link](a\ b) .

[link](a\ b)

```````````````````````````````` + +Issue #527 - meta tags in inline contexts + +```````````````````````````````` example +City: + + + +. +

City: + + +

+```````````````````````````````` + +Double-encoding. + +```````````````````````````````` example +[XSS](javascript&colon;alert%28'XSS'%29) +. +

XSS

+```````````````````````````````` + +Issue commonmark#517 - script, pre, style close tag without +opener. + +```````````````````````````````` example +</script> + +</pre> + +</style> +. +</script> +</pre> +</style> +```````````````````````````````` + +Issue #289. + +```````````````````````````````` example +[a](<b) c> +. +

[a](<b) c>

+```````````````````````````````` + +Issue #161. + +```````````````````````````````` example +*failed to be italic!*\ +text +. +

failed to be italic!
+text

+```````````````````````````````` + + diff --git a/commonmark-test-util/src/main/resources/spec.txt b/commonmark-test-util/src/main/resources/spec.txt index 9fd584139..3913de442 100644 --- a/commonmark-test-util/src/main/resources/spec.txt +++ b/commonmark-test-util/src/main/resources/spec.txt @@ -1,8 +1,8 @@ --- title: CommonMark Spec author: John MacFarlane -version: 0.28 -date: '2017-08-01' +version: 0.29 +date: '2019-04-06' license: '[CC-BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/)' ... @@ -248,7 +248,7 @@ satisfactory replacement for a spec. Because there is no unambiguous spec, implementations have diverged considerably. As a result, users are often surprised to find that -a document that renders one way on one system (say, a github wiki) +a document that renders one way on one system (say, a GitHub wiki) renders differently on another (say, converting to docbook using pandoc). To make matters worse, because nothing in Markdown counts as a "syntax error," the divergence often isn't discovered right away. @@ -328,8 +328,10 @@ that is not a [whitespace character]. An [ASCII punctuation character](@) is `!`, `"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`, -`*`, `+`, `,`, `-`, `.`, `/`, `:`, `;`, `<`, `=`, `>`, `?`, `@`, -`[`, `\`, `]`, `^`, `_`, `` ` ``, `{`, `|`, `}`, or `~`. +`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), +`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040), +`[`, `\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), +`{`, `|`, `}`, or `~` (U+007B–007E). A [punctuation character](@) is an [ASCII punctuation character] or anything in @@ -514,8 +516,8 @@ one block element does not affect the inline parsing of any other. ## Container blocks and leaf blocks We can divide blocks into two types: -[container block](@)s, -which can contain other blocks, and [leaf block](@)s, +[container blocks](@), +which can contain other blocks, and [leaf blocks](@), which cannot. # Leaf blocks @@ -527,7 +529,7 @@ Markdown document. 
A line consisting of 0-3 spaces of indentation, followed by a sequence of three or more matching `-`, `_`, or `*` characters, each followed -optionally by any number of spaces, forms a +optionally by any number of spaces or tabs, forms a [thematic break](@). ```````````````````````````````` example @@ -825,7 +827,7 @@ Contents are parsed as inlines: ```````````````````````````````` -Leading and trailing blanks are ignored in parsing inline content: +Leading and trailing [whitespace] is ignored in parsing inline content: ```````````````````````````````` example # foo @@ -1024,6 +1026,20 @@ baz* baz ```````````````````````````````` +The contents are the result of parsing the headings's raw +content as inlines. The heading's raw content is formed by +concatenating the lines and removing initial and final +[whitespace]. + +```````````````````````````````` example + Foo *bar +baz*→ +==== +. +

Foo bar +baz

+```````````````````````````````` + The underlining can be any length: @@ -1584,8 +1600,8 @@ begins with a code fence, indented no more than three spaces. The line with the opening code fence may optionally contain some text following the code fence; this is trimmed of leading and trailing -spaces and called the [info string](@). -The [info string] may not contain any backtick +whitespace and called the [info string](@). If the [info string] comes +after a backtick fence, it may not contain any backtick characters. (The reason for this restriction is that otherwise some inline code would be incorrectly interpreted as the beginning of a fenced code block.) @@ -1870,7 +1886,7 @@ Code fences (opening and closing) cannot contain internal spaces: ``` ``` aaa . -

+

aaa

```````````````````````````````` @@ -1922,9 +1938,11 @@ bar An [info string] can be provided after the opening code fence. -Opening and closing spaces will be stripped, and the first word, prefixed -with `language-`, is used as the value for the `class` attribute of the -`code` element within the enclosing `pre` element. +Although this spec doesn't mandate any particular treatment of +the info string, the first word is typically used to specify +the language of the code block. In HTML output, the language is +normally indicated by adding a class to the `code` element consisting +of `language-` followed by the language name. ```````````````````````````````` example ```ruby @@ -1973,6 +1991,18 @@ foo

```````````````````````````````` +[Info strings] for tilde code blocks can contain backticks and tildes: + +```````````````````````````````` example +~~~ aa ``` ~~~ +foo +~~~ +. +
foo
+
+```````````````````````````````` + + Closing code fences cannot have [info strings]: ```````````````````````````````` example @@ -1991,14 +2021,15 @@ Closing code fences cannot have [info strings]: An [HTML block](@) is a group of lines that is treated as raw HTML (and will not be escaped in HTML output). -There are seven kinds of [HTML block], which can be defined -by their start and end conditions. The block begins with a line that -meets a [start condition](@) (after up to three spaces -optional indentation). It ends with the first subsequent line that -meets a matching [end condition](@), or the last line of -the document or other [container block]), if no line is encountered that meets the -[end condition]. If the first line meets both the [start condition] -and the [end condition], the block will contain just that line. +There are seven kinds of [HTML block], which can be defined by their +start and end conditions. The block begins with a line that meets a +[start condition](@) (after up to three spaces optional indentation). +It ends with the first subsequent line that meets a matching [end +condition](@), or the last line of the document, or the last line of +the [container block](#container-blocks) containing the current HTML +block, if no line is encountered that meets the [end condition]. If +the first line meets both the [start condition] and the [end +condition], the block will contain just that line. 1. **Start condition:** line begins with the string ``, or @@ -2037,16 +2068,17 @@ the string `/>`.\ **End condition:** line is followed by a [blank line]. 7. **Start condition:** line begins with a complete [open tag] -or [closing tag] (with any [tag name] other than `script`, -`style`, or `pre`) followed only by [whitespace] -or the end of the line.\ +(with any [tag name] other than `script`, +`style`, or `pre`) or a complete [closing tag], +followed only by [whitespace] or the end of the line.\ **End condition:** line is followed by a [blank line]. 
HTML blocks continue until they are closed by their appropriate -[end condition], or the last line of the document or other [container block]. -This means any HTML **within an HTML block** that might otherwise be recognised -as a start condition will be ignored by the parser and passed through as-is, -without changing the parser's state. +[end condition], or the last line of the document or other [container +block](#container-blocks). This means any HTML **within an HTML +block** that might otherwise be recognised as a start condition will +be ignored by the parser and passed through as-is, without changing +the parser's state. For instance, `
` within a HTML block started by `<table>` will not affect
 the parser state; as the HTML block was started by start condition 6, it
@@ -2069,7 +2101,7 @@ _world_.
 
```````````````````````````````` -In this case, the HTML block is terminated by the newline — the `**hello**` +In this case, the HTML block is terminated by the newline — the `**Hello**` text remains verbatim — and regular parsing resumes, with a paragraph, emphasised `world` and inline and block HTML following. @@ -2612,7 +2644,8 @@ bar However, a following blank line is needed, except at the end of -a document, and except for blocks of types 1--5, above: +a document, and except for blocks of types 1--5, [above][HTML +block]: ```````````````````````````````` example
@@ -2758,8 +2791,8 @@ an indented code block: Fortunately, blank lines are usually not necessary and can be deleted. The exception is inside `
` tags, but as described
-above, raw HTML blocks starting with `<pre>` *can* contain blank
-lines.
+[above][HTML blocks], raw HTML blocks starting with `<pre>`
+*can* contain blank lines.
 
 ## Link reference definitions
 
@@ -2811,7 +2844,7 @@ them.
 
 ```````````````````````````````` example
 [Foo bar]:
-
+
 'title'
 
 [Foo bar]
@@ -2877,6 +2910,29 @@ The link destination may not be omitted:
 

[foo]

```````````````````````````````` + However, an empty link destination may be specified using + angle brackets: + +```````````````````````````````` example +[foo]: <> + +[foo] +. +

foo

+```````````````````````````````` + +The title must be separated from the link destination by +whitespace: + +```````````````````````````````` example +[foo]: (baz) + +[foo] +. +

[foo]: (baz)

+

[foo]

+```````````````````````````````` + Both title and destination can contain backslash escapes and literal backslashes: @@ -3034,6 +3090,25 @@ and thematic breaks, and it need not be followed by a blank line. ```````````````````````````````` +```````````````````````````````` example +[foo]: /url +bar +=== +[foo] +. +

bar

+

foo

+```````````````````````````````` + +```````````````````````````````` example +[foo]: /url +=== +[foo] +. +

=== +foo

+```````````````````````````````` + Several [link reference definitions] can occur one after another, without intervening blank lines. @@ -3070,6 +3145,17 @@ are defined: ```````````````````````````````` +Whether something is a [link reference definition] is +independent of whether the link reference it defines is +used in the document. Thus, for example, the following +document contains just a link reference definition, and +no visible content: + +```````````````````````````````` example +[foo]: /url +. +```````````````````````````````` + ## Paragraphs @@ -3207,7 +3293,7 @@ aaa # Container blocks -A [container block] is a block that has other +A [container block](#container-blocks) is a block that has other blocks as its contents. There are two basic kinds of container blocks: [block quotes] and [list items]. [Lists] are meta-containers for [list items]. @@ -3669,9 +3755,8 @@ in some browsers.) The following rules define [list items]: 1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of - blocks *Bs* starting with a [non-whitespace character] and not separated - from each other by more than one blank line, and *M* is a list - marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result + blocks *Bs* starting with a [non-whitespace character], and *M* is a + list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces, then the result of prepending *M* and the following spaces to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a list item with *Bs* as its contents. The type of the list item @@ -3981,8 +4066,7 @@ A start number may not be negative: 2. 
**Item starting with indented code.** If a sequence of lines *Ls* constitute a sequence of blocks *Bs* starting with an indented code - block and not separated from each other by more than one blank line, - and *M* is a list marker of width *W* followed by + block, and *M* is a list marker of width *W* followed by one space, then the result of prepending *M* and the following space to the first line of *Ls*, and indenting subsequent lines of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents. @@ -4458,9 +4542,10 @@ continued here.

6. **That's all.** Nothing that is not counted as a list item by rules #1--5 counts as a [list item](#list-items). -The rules for sublists follow from the general rules above. A sublist -must be indented the same number of spaces a paragraph would need to be -in order to be included in the list item. +The rules for sublists follow from the general rules +[above][List items]. A sublist must be indented the same number +of spaces a paragraph would need to be in order to be included +in the list item. So, in this case we need two spaces indent: @@ -5049,11 +5134,9 @@ item: - b - c - d - - e - - f - - g - - h -- i + - e + - f +- g .
  • a
  • @@ -5063,8 +5146,6 @@ item:
  • e
  • f
  • g
  • -
  • h
  • -
  • i
```````````````````````````````` @@ -5074,7 +5155,7 @@ item: 2. b - 3. c + 3. c .
  1. @@ -5089,6 +5170,49 @@ item:
```````````````````````````````` +Note, however, that list items may not be indented more than +three spaces. Here `- e` is treated as a paragraph continuation +line, because it is indented more than three spaces: + +```````````````````````````````` example +- a + - b + - c + - d + - e +. +
    +
  • a
  • +
  • b
  • +
  • c
  • +
  • d +- e
  • +
+```````````````````````````````` + +And here, `3. c` is treated as an indented code block, +because it is indented four spaces and preceded by a +blank line. + +```````````````````````````````` example +1. a + + 2. b + + 3. c +. +
    +
  1. +

    a

    +
  2. +
  3. +

    b

    +
  4. +
+
3. c
+
+```````````````````````````````` + This is a loose list, because there is a blank line between two of the list items: @@ -5378,10 +5502,10 @@ Thus, for example, in

hilo`

```````````````````````````````` - `hi` is parsed as code, leaving the backtick at the end as a literal backtick. + ## Backslash escapes Any ASCII punctuation character may be backslash-escaped: @@ -5415,6 +5539,7 @@ not have their usual Markdown meanings: \* not a list \# not a heading \[foo]: /url "not a reference" +\ö not a character entity .

*not emphasized* <br/> not a tag @@ -5423,7 +5548,8 @@ not have their usual Markdown meanings: 1. not a list * not a list # not a heading -[foo]: /url "not a reference"

+[foo]: /url "not a reference" +&ouml; not a character entity

```````````````````````````````` @@ -5521,13 +5647,23 @@ foo ## Entity and numeric character references -All valid HTML entity references and numeric character -references, except those occuring in code blocks and code spans, -are recognized as such and treated as equivalent to the -corresponding Unicode characters. Conforming CommonMark parsers -need not store information about whether a particular character -was represented in the source using a Unicode character or -an entity reference. +Valid HTML entity references and numeric character references +can be used in place of the corresponding Unicode character, +with the following exceptions: + +- Entity and character references are not recognized in code + blocks and code spans. + +- Entity and character references cannot stand in place of + special characters that define structural elements in + CommonMark. For example, although `*` can be used + in place of a literal `*` character, `*` cannot replace + `*` in emphasis delimiters, bullet list markers, or thematic + breaks. + +Conforming CommonMark parsers need not store information about +whether a particular character was represented in the source +using a Unicode character or an entity reference. [Entity references](@) consist of `&` + any of the valid HTML5 entity names + `;`. The @@ -5548,22 +5684,22 @@ references and their corresponding code points. [Decimal numeric character references](@) -consist of `&#` + a string of 1--8 arabic digits + `;`. A +consist of `&#` + a string of 1--7 arabic digits + `;`. A numeric character reference is parsed as the corresponding Unicode character. Invalid Unicode code points will be replaced by the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons, the code point `U+0000` will also be replaced by `U+FFFD`. ```````````````````````````````` example -# Ӓ Ϡ � � +# Ӓ Ϡ � . -

# Ӓ Ϡ � �

+

# Ӓ Ϡ �

```````````````````````````````` [Hexadecimal numeric character references](@) consist of `&#` + -either `X` or `x` + a string of 1-8 hexadecimal digits + `;`. +either `X` or `x` + a string of 1-6 hexadecimal digits + `;`. They too are parsed as the corresponding Unicode character (this time specified with a hexadecimal numeral instead of decimal). @@ -5578,9 +5714,13 @@ Here are some nonentities: ```````````````````````````````` example   &x; &#; &#x; +� +&#abcdef0; &ThisIsNotDefined; &hi?; .

&nbsp &x; &#; &#x; +&#987654321; +&#abcdef0; &ThisIsNotDefined; &hi?;

```````````````````````````````` @@ -5661,6 +5801,51 @@ text in code spans and code blocks: ```````````````````````````````` +Entity and numeric character references cannot be used +in place of symbols indicating structure in CommonMark +documents. + +```````````````````````````````` example +*foo* +*foo* +. +

*foo* +foo

+```````````````````````````````` + +```````````````````````````````` example +* foo + +* foo +. +

* foo

+
    +
  • foo
  • +
+```````````````````````````````` + +```````````````````````````````` example +foo bar +. +

foo + +bar

+```````````````````````````````` + +```````````````````````````````` example + foo +. +

→foo

+```````````````````````````````` + + +```````````````````````````````` example +[a](url "tit") +. +

[a](url "tit")

+```````````````````````````````` + + ## Code spans A [backtick string](@) @@ -5669,9 +5854,16 @@ preceded nor followed by a backtick. A [code span](@) begins with a backtick string and ends with a backtick string of equal length. The contents of the code span are -the characters between the two backtick strings, with leading and -trailing spaces and [line endings] removed, and -[whitespace] collapsed to single spaces. +the characters between the two backtick strings, normalized in the +following ways: + +- First, [line endings] are converted to [spaces]. +- If the resulting string both begins *and* ends with a [space] + character, but does not consist entirely of [space] + characters, a single [space] character is removed from the + front and back. This allows you to include code that begins + or ends with backtick characters, which must be separated by + whitespace from the opening or closing backtick strings. This is a simple code span: @@ -5683,10 +5875,11 @@ This is a simple code span: Here two backticks are used, because the code contains a backtick. -This example also illustrates stripping of leading and trailing spaces: +This example also illustrates stripping of a single leading and +trailing space: ```````````````````````````````` example -`` foo ` bar `` +`` foo ` bar `` .

foo ` bar

```````````````````````````````` @@ -5701,58 +5894,79 @@ spaces:

``

```````````````````````````````` +Note that only *one* space is stripped: -[Line endings] are treated like spaces: +```````````````````````````````` example +` `` ` +. +

``

+```````````````````````````````` + +The stripping only happens if the space is on both +sides of the string: ```````````````````````````````` example -`` -foo -`` +` a` . -

foo

+

a

```````````````````````````````` +Only [spaces], and not [unicode whitespace] in general, are +stripped in this way: + +```````````````````````````````` example +` b ` +. +

 b 

+```````````````````````````````` -Interior spaces and [line endings] are collapsed into -single spaces, just as they would be by a browser: +No stripping occurs if the code span contains only spaces: ```````````````````````````````` example -`foo bar - baz` +` ` +` ` . -

foo bar baz

+

  +

```````````````````````````````` -Not all [Unicode whitespace] (for instance, non-breaking space) is -collapsed, however: +[Line endings] are treated like spaces: ```````````````````````````````` example -`a  b` +`` +foo +bar +baz +`` . -

a  b

+

foo bar baz

```````````````````````````````` +```````````````````````````````` example +`` +foo +`` +. +

foo

+```````````````````````````````` -Q: Why not just leave the spaces, since browsers will collapse them -anyway? A: Because we might be targeting a non-HTML format, and we -shouldn't rely on HTML-specific rendering assumptions. -(Existing implementations differ in their treatment of internal -spaces and [line endings]. Some, including `Markdown.pl` and -`showdown`, convert an internal [line ending] into a -`
` tag. But this makes things difficult for those who like to -hard-wrap their paragraphs, since a line break in the midst of a code -span will cause an unintended line break in the output. Others just -leave internal spaces as they are, which is fine if only HTML is being -targeted.) +Interior spaces are not collapsed: ```````````````````````````````` example -`foo `` bar` +`foo bar +baz` . -

foo `` bar

+

foo bar baz

```````````````````````````````` +Note that browsers will typically collapse consecutive spaces +when rendering `` elements, so it is recommended that +the following CSS be used: + + code{white-space: pre-wrap;} + Note that backslash escapes do not work in code spans. All backslashes are treated literally: @@ -5768,6 +5982,19 @@ Backslash escapes are never needed, because one can always choose a string of *n* backtick characters as delimiters, where the code does not contain any strings of exactly *n* backtick characters. +```````````````````````````````` example +``foo`bar`` +. +

foo`bar

+```````````````````````````````` + +```````````````````````````````` example +` foo `` bar ` +. +

foo `` bar

+```````````````````````````````` + + Code span backticks have higher precedence than any other inline constructs except HTML tags and autolinks. Thus, for example, this is not parsed as emphasized text, since the second `*` is part of a code @@ -5905,15 +6132,17 @@ of one or more `_` characters that is not preceded or followed by a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is -a [delimiter run] that is (a) not followed by [Unicode whitespace], -and (b) not followed by a [punctuation character], or +a [delimiter run] that is (1) not followed by [Unicode whitespace], +and either (2a) not followed by a [punctuation character], or +(2b) followed by a [punctuation character] and preceded by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. A [right-flanking delimiter run](@) is -a [delimiter run] that is (a) not preceded by [Unicode whitespace], -and (b) not preceded by a [punctuation character], or +a [delimiter run] that is (1) not preceded by [Unicode whitespace], +and either (2a) not preceded by a [punctuation character], or +(2b) preceded by a [punctuation character] and followed by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. @@ -6005,7 +6234,8 @@ The following rules define emphasis and strong emphasis: [delimiter runs]. If one of the delimiters can both open and close emphasis, then the sum of the lengths of the delimiter runs containing the opening and closing delimiters - must not be a multiple of 3. + must not be a multiple of 3 unless both lengths are + multiples of 3. 10. Strong emphasis begins with a delimiter that [can open strong emphasis] and ends with a delimiter that @@ -6015,7 +6245,8 @@ The following rules define emphasis and strong emphasis: [delimiter runs]. 
If one of the delimiters can both open and close strong emphasis, then the sum of the lengths of the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3. + delimiters must not be a multiple of 3 unless both lengths + are multiples of 3. 11. A literal `*` character cannot occur at the beginning or end of `*`-delimited emphasis or `**`-delimited strong emphasis, unless it @@ -6634,7 +6865,19 @@ is precluded by the condition that a delimiter that can both open and close (like the `*` after `foo`) cannot form emphasis if the sum of the lengths of the delimiter runs containing the opening and -closing delimiters is a multiple of 3. +closing delimiters is a multiple of 3 unless +both lengths are multiples of 3. + + +For the same reason, we don't get two consecutive +emphasis sections in this example: + +```````````````````````````````` example +*foo**bar* +. +

foo**bar

+```````````````````````````````` + The same condition ensures that the following cases are all strong emphasis nested inside @@ -6663,6 +6906,23 @@ omitted: ```````````````````````````````` +When the lengths of the interior closing and opening +delimiter runs are *both* multiples of 3, though, +they can match to create emphasis: + +```````````````````````````````` example +foo***bar***baz +. +

foobarbaz

+```````````````````````````````` + +```````````````````````````````` example +foo******bar*********baz +. +

foobar***baz

+```````````````````````````````` + + Indefinite levels of nesting are possible: ```````````````````````````````` example @@ -7198,15 +7458,16 @@ following rules apply: A [link destination](@) consists of either - a sequence of zero or more characters between an opening `<` and a - closing `>` that contains no spaces, line breaks, or unescaped + closing `>` that contains no line breaks or unescaped `<` or `>` characters, or -- a nonempty sequence of characters that does not include - ASCII space or control characters, and includes parentheses - only if (a) they are backslash-escaped or (b) they are part of - a balanced pair of unescaped parentheses. (Implementations - may impose limits on parentheses nesting to avoid performance - issues, but at least three levels of nesting should be supported.) +- a nonempty sequence of characters that does not start with + `<`, does not include ASCII space or control characters, and + includes parentheses only if (a) they are backslash-escaped or + (b) they are part of a balanced pair of unescaped parentheses. + (Implementations may impose limits on parentheses nesting to + avoid performance issues, but at least three levels of nesting + should be supported.) A [link title](@) consists of either @@ -7219,7 +7480,8 @@ A [link title](@) consists of either backslash-escaped, or - a sequence of zero or more characters between matching parentheses - (`(...)`), including a `)` character only if it is backslash-escaped. + (`(...)`), including a `(` or `)` character only if it is + backslash-escaped. Although [link titles] may span multiple lines, they may not contain a [blank line]. @@ -7269,9 +7531,8 @@ Both the title and the destination may be omitted:

link

```````````````````````````````` - -The destination cannot contain spaces or line breaks, -even if enclosed in pointy brackets: +The destination can only contain spaces if it is +enclosed in pointy brackets: ```````````````````````````````` example [link](/my uri) @@ -7279,13 +7540,14 @@ even if enclosed in pointy brackets:

[link](/my uri)

```````````````````````````````` - ```````````````````````````````` example [link](
) . -

[link](</my uri>)

+

link

```````````````````````````````` +The destination cannot contain line breaks, +even if enclosed in pointy brackets: ```````````````````````````````` example [link](foo @@ -7295,7 +7557,6 @@ bar) bar)

```````````````````````````````` - ```````````````````````````````` example [link]() @@ -7304,6 +7565,36 @@ bar>) bar>)

```````````````````````````````` +The destination can contain `)` if it is enclosed +in pointy brackets: + +```````````````````````````````` example +[a]() +. +

a

+```````````````````````````````` + +Pointy brackets that enclose links must be unescaped: + +```````````````````````````````` example +[link]() +. +

[link](<foo>)

+```````````````````````````````` + +These are not links, because the opening pointy bracket +is not matched properly: + +```````````````````````````````` example +[a]( +[a](c) +. +

[a](<b)c +[a](<b)c> +[a](c)

+```````````````````````````````` + Parentheses inside the link destination may be escaped: ```````````````````````````````` example @@ -8411,7 +8702,7 @@ If you want a link after a literal `!`, backslash-escape the as the link label. A [URI autolink](@) consists of `<`, followed by an -[absolute URI] not containing `<`, followed by `>`. It is parsed as +[absolute URI] followed by `>`. It is parsed as a link to the URI, with the URI as the link's label. An [absolute URI](@), @@ -8624,7 +8915,7 @@ a [single-quoted attribute value], or a [double-quoted attribute value]. An [unquoted attribute value](@) is a nonempty string of characters not -including spaces, `"`, `'`, `=`, `<`, `>`, or `` ` ``. +including [whitespace], `"`, `'`, `=`, `<`, `>`, or `` ` ``. A [single-quoted attribute value](@) consists of `'`, zero or more @@ -8745,9 +9036,13 @@ Illegal [whitespace]: ```````````````````````````````` example < a>< foo> + .

< a>< -foo><bar/ >

+foo><bar/ > +<foo bar=baz +bim!bop />

```````````````````````````````` @@ -8944,10 +9239,10 @@ bar

Line breaks do not occur inside code spans ```````````````````````````````` example -`code +`code span` . -

code span

+

code span

```````````````````````````````` @@ -9365,7 +9660,8 @@ just above `stack_bottom` (or the first element if `stack_bottom` is NULL). We keep track of the `openers_bottom` for each delimiter -type (`*`, `_`). Initialize this to `stack_bottom`. +type (`*`, `_`) and each length of the closing delimiter run +(modulo 3). Initialize this to `stack_bottom`. Then we repeat the following until we run out of potential closers: @@ -9397,7 +9693,7 @@ closers: of the delimiter stack. If the closing node is removed, reset `current_position` to the next element in the stack. -- If none in found: +- If none is found: + Set `openers_bottom` to the element before `current_position`. (We know that there are no openers for this kind of closer up to and diff --git a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java index 539516c1d..31716da88 100644 --- a/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/DocumentParser.java @@ -3,7 +3,9 @@ import org.commonmark.internal.util.Parsing; import org.commonmark.node.*; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.InlineParserFactory; import org.commonmark.parser.block.*; +import org.commonmark.parser.delimiter.DelimiterProcessor; import java.io.BufferedReader; import java.io.IOException; @@ -59,15 +61,20 @@ public class DocumentParser implements ParserState { private boolean blank; private final List blockParserFactories; - private final InlineParser inlineParser; + private final InlineParserFactory inlineParserFactory; + private final List delimiterProcessors; private final DocumentBlockParser documentBlockParser; + private final Map definitions = new LinkedHashMap<>(); private List activeBlockParsers = new ArrayList<>(); - private Set allBlockParsers = new HashSet<>(); + // LinkedHashSet to have a deterministic order + private Set allBlockParsers = new 
LinkedHashSet<>(); - public DocumentParser(List blockParserFactories, InlineParser inlineParser) { + public DocumentParser(List blockParserFactories, InlineParserFactory inlineParserFactory, + List delimiterProcessors) { this.blockParserFactories = blockParserFactories; - this.inlineParser = inlineParser; + this.inlineParserFactory = inlineParserFactory; + this.delimiterProcessors = delimiterProcessors; this.documentBlockParser = new DocumentBlockParser(); activateBlockParser(this.documentBlockParser); @@ -233,7 +240,7 @@ private void incorporateLine(CharSequence ln) { } if (blockStart.isReplaceActiveBlockParser()) { - removeActiveBlockParser(); + prepareActiveBlockParserForReplacement(); } for (BlockParser newBlockParser : blockStart.getBlockParsers()) { @@ -386,10 +393,20 @@ private void finalize(BlockParser blockParser) { blockParser.closeBlock(); - if (blockParser instanceof ParagraphParser - && inlineParser instanceof ReferenceParser) { + if (blockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) blockParser; - paragraphParser.closeBlock((ReferenceParser) inlineParser); + // TODO: Insert resulting nodes into AST (before paragraph node) + addDefinitionsFrom(paragraphParser); + } + } + + private void addDefinitionsFrom(ParagraphParser paragraphParser) { + for (LinkReferenceDefinition definition : paragraphParser.getDefinitions()) { + String label = definition.getLabel(); + // spec: When there are multiple matching link reference definitions, the first is used + if (!definitions.containsKey(label)) { + definitions.put(label, definition); + } } } @@ -397,6 +414,9 @@ private void finalize(BlockParser blockParser) { * Walk through a block & children recursively, parsing string content into inline content where appropriate. 
*/ private void processInlines() { + InlineParserContextImpl context = new InlineParserContextImpl(delimiterProcessors, definitions); + InlineParser inlineParser = inlineParserFactory.create(context); + for (BlockParser blockParser : allBlockParsers) { blockParser.parseInlines(inlineParser); } @@ -426,11 +446,21 @@ private void deactivateBlockParser() { activeBlockParsers.remove(activeBlockParsers.size() - 1); } - private void removeActiveBlockParser() { + private void prepareActiveBlockParserForReplacement() { BlockParser old = getActiveBlockParser(); deactivateBlockParser(); allBlockParsers.remove(old); + if (old instanceof ParagraphParser) { + ParagraphParser paragraphParser = (ParagraphParser) old; + // Collect any link reference definitions. Note that replacing the active block parser is done after a + // block parser got the current paragraph content using MatchedBlockParser#getContentString. In case the + // paragraph started with link reference definitions, we parse and strip them before the block parser gets + // the content. We want to keep them. + // If no replacement happens, we collect the definitions as part of finalizing paragraph blocks. 
+ addDefinitionsFrom(paragraphParser); + } + old.getBlock().unlink(); } @@ -467,7 +497,12 @@ public BlockParser getMatchedBlockParser() { public CharSequence getParagraphContent() { if (matchedBlockParser instanceof ParagraphParser) { ParagraphParser paragraphParser = (ParagraphParser) matchedBlockParser; - return paragraphParser.getContentString(); + CharSequence content = paragraphParser.getContentString(); + if (content.length() == 0) { + return null; + } + + return content; } return null; } diff --git a/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java index 6352892cf..e57cc7277 100644 --- a/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/FencedCodeBlockParser.java @@ -102,17 +102,13 @@ private static FencedCodeBlockParser checkOpener(CharSequence line, int index, i } } if (backticks >= 3 && tildes == 0) { - // spec: The info string may not contain any backtick characters. + // spec: If the info string comes after a backtick fence, it may not contain any backtick characters. 
if (Parsing.find('`', line, index + backticks) != -1) { return null; } return new FencedCodeBlockParser('`', backticks, indent); } else if (tildes >= 3 && backticks == 0) { - // This follows commonmark.js but the spec is unclear about this: - // https://github.com/commonmark/CommonMark/issues/119 - if (Parsing.find('~', line, index + tildes) != -1) { - return null; - } + // spec: Info strings for tilde code blocks can contain backticks and tildes return new FencedCodeBlockParser('~', tildes, indent); } else { return null; diff --git a/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java index c80822d3c..3b3a0e64f 100644 --- a/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/HtmlBlockParser.java @@ -42,7 +42,7 @@ public class HtmlBlockParser extends AbstractBlockParser { "h1|h2|h3|h4|h5|h6|head|header|hr|html|" + "iframe|" + "legend|li|link|" + - "main|menu|menuitem|meta|" + + "main|menu|menuitem|" + "nav|noframes|" + "ol|optgroup|option|" + "p|param|" + diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java new file mode 100644 index 000000000..bff085ad8 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java @@ -0,0 +1,30 @@ +package org.commonmark.internal; + +import org.commonmark.node.LinkReferenceDefinition; +import org.commonmark.parser.InlineParserContext; +import org.commonmark.parser.delimiter.DelimiterProcessor; + +import java.util.List; +import java.util.Map; + +public class InlineParserContextImpl implements InlineParserContext { + + private final List delimiterProcessors; + private final Map linkReferenceDefinitions; + + public InlineParserContextImpl(List delimiterProcessors, + Map linkReferenceDefinitions) { + 
this.delimiterProcessors = delimiterProcessors; + this.linkReferenceDefinitions = linkReferenceDefinitions; + } + + @Override + public List getCustomDelimiterProcessors() { + return delimiterProcessors; + } + + @Override + public LinkReferenceDefinition getLinkReferenceDefinition(String label) { + return linkReferenceDefinitions.get(label); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java index c6bbacc1a..38972cada 100644 --- a/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java +++ b/commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java @@ -4,25 +4,25 @@ import org.commonmark.internal.inline.UnderscoreDelimiterProcessor; import org.commonmark.internal.util.Escaping; import org.commonmark.internal.util.Html5Entities; +import org.commonmark.internal.util.LinkScanner; import org.commonmark.internal.util.Parsing; import org.commonmark.node.*; import org.commonmark.parser.InlineParser; +import org.commonmark.parser.InlineParserContext; import org.commonmark.parser.delimiter.DelimiterProcessor; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class InlineParserImpl implements InlineParser, ReferenceParser { +public class InlineParserImpl implements InlineParser { - private static final String ESCAPED_CHAR = "\\\\" + Escaping.ESCAPABLE; private static final String HTMLCOMMENT = "|"; private static final String PROCESSINGINSTRUCTION = "[<][?].*?[?][>]"; private static final String DECLARATION = "]*>"; private static final String CDATA = ""; private static final String HTMLTAG = "(?:" + Parsing.OPENTAG + "|" + Parsing.CLOSETAG + "|" + HTMLCOMMENT + "|" + PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")"; - private static final String ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; private static final String ASCII_PUNCTUATION = 
"!\"#\\$%&'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}~"; private static final Pattern PUNCTUATION = Pattern @@ -30,20 +30,9 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { private static final Pattern HTML_TAG = Pattern.compile('^' + HTMLTAG, Pattern.CASE_INSENSITIVE); - private static final Pattern LINK_TITLE = Pattern.compile( - "^(?:\"(" + ESCAPED_CHAR + "|[^\"\\x00])*\"" + - '|' + - "'(" + ESCAPED_CHAR + "|[^'\\x00])*'" + - '|' + - "\\((" + ESCAPED_CHAR + "|[^)\\x00])*\\))"); - - private static final Pattern LINK_DESTINATION_BRACES = Pattern.compile("^(?:[<](?:[^<> \\t\\n\\\\]|\\\\.)*[>])"); - - private static final Pattern LINK_LABEL = Pattern.compile("^\\[(?:[^\\\\\\[\\]]|\\\\.)*\\]"); - private static final Pattern ESCAPABLE = Pattern.compile('^' + Escaping.ESCAPABLE); - private static final Pattern ENTITY_HERE = Pattern.compile('^' + ENTITY, Pattern.CASE_INSENSITIVE); + private static final Pattern ENTITY_HERE = Pattern.compile('^' + Escaping.ENTITY, Pattern.CASE_INSENSITIVE); private static final Pattern TICKS = Pattern.compile("`+"); @@ -63,16 +52,10 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { private static final Pattern FINAL_SPACE = Pattern.compile(" *$"); - private static final Pattern LINE_END = Pattern.compile("^ *(?:\n|$)"); - private final BitSet specialCharacters; private final BitSet delimiterCharacters; private final Map delimiterProcessors; - - /** - * Link references by ID, needs to be built up using parseReference before calling parse. 
- */ - private Map referenceMap = new HashMap<>(); + private final InlineParserContext context; private String input; private int index; @@ -88,10 +71,12 @@ public class InlineParserImpl implements InlineParser, ReferenceParser { */ private Bracket lastBracket; - public InlineParserImpl(List delimiterProcessors) { - this.delimiterProcessors = calculateDelimiterProcessors(delimiterProcessors); + public InlineParserImpl(InlineParserContext inlineParserContext) { + this.delimiterProcessors = calculateDelimiterProcessors(inlineParserContext.getCustomDelimiterProcessors()); this.delimiterCharacters = calculateDelimiterCharacters(this.delimiterProcessors.keySet()); this.specialCharacters = calculateSpecialCharacters(delimiterCharacters); + + this.context = inlineParserContext; } public static BitSet calculateDelimiterCharacters(Set characters) { @@ -161,10 +146,7 @@ private static void addDelimiterProcessorForChar(char delimiterChar, DelimiterPr */ @Override public void parse(String content, Node block) { - this.input = content.trim(); - this.index = 0; - this.lastDelimiter = null; - this.lastBracket = null; + reset(content.trim()); Node previous = null; while (true) { @@ -181,80 +163,14 @@ public void parse(String content, Node block) { mergeChildTextNodes(block); } - /** - * Attempt to parse a link reference, modifying the internal reference map. 
- */ - @Override - public int parseReference(String s) { - this.input = s; + void reset(String content) { + this.input = content; this.index = 0; - String dest; - String title; - int matchChars; - int startIndex = index; - - // label: - matchChars = parseLinkLabel(); - if (matchChars == 0) { - return 0; - } - - String rawLabel = input.substring(0, matchChars); - - // colon: - if (peek() != ':') { - return 0; - } - index++; - - // link url - spnl(); - - dest = parseLinkDestination(); - if (dest == null || dest.length() == 0) { - return 0; - } - - int beforeTitle = index; - spnl(); - title = parseLinkTitle(); - if (title == null) { - // rewind before spaces - index = beforeTitle; - } - - boolean atLineEnd = true; - if (index != input.length() && match(LINE_END) == null) { - if (title == null) { - atLineEnd = false; - } else { - // the potential title we found is not at the line end, - // but it could still be a legal link reference if we - // discard the title - title = null; - // rewind before spaces - index = beforeTitle; - // and instead check if the link URL is at the line end - atLineEnd = match(LINE_END) != null; - } - } - - if (!atLineEnd) { - return 0; - } - - String normalizedLabel = Escaping.normalizeReference(rawLabel); - if (normalizedLabel.isEmpty()) { - return 0; - } - - if (!referenceMap.containsKey(normalizedLabel)) { - Link link = new Link(dest, title); - referenceMap.put(normalizedLabel, link); - } - return index - startIndex; + this.lastDelimiter = null; + this.lastBracket = null; } + private Text text(String text, int beginIndex, int endIndex) { return new Text(text.substring(beginIndex, endIndex)); } @@ -419,8 +335,18 @@ private Node parseBackticks() { if (matched.equals(ticks)) { Code node = new Code(); String content = input.substring(afterOpenTicks, index - ticks.length()); - String literal = WHITESPACE.matcher(content.trim()).replaceAll(" "); - node.setLiteral(literal); + content = content.replace('\n', ' '); + + // spec: If the resulting 
string both begins and ends with a space character, but does not consist + // entirely of space characters, a single space character is removed from the front and back. + if (content.length() >= 3 && + content.charAt(0) == ' ' && + content.charAt(content.length() - 1) == ' ' && + Parsing.hasNonSpace(content)) { + content = content.substring(1, content.length() - 1); + } + + node.setLiteral(content); return node; } } @@ -542,7 +468,8 @@ private Node parseCloseBracket() { // See if there's a link label like `[bar]` or `[]` int beforeLabel = index; - int labelLength = parseLinkLabel(); + parseLinkLabel(); + int labelLength = index - beforeLabel; String ref = null; if (labelLength > 2) { ref = input.substring(beforeLabel, beforeLabel + labelLength); @@ -554,10 +481,11 @@ private Node parseCloseBracket() { } if (ref != null) { - Link link = referenceMap.get(Escaping.normalizeReference(ref)); - if (link != null) { - dest = link.getDestination(); - title = link.getTitle(); + String label = Escaping.normalizeReference(ref); + LinkReferenceDefinition definition = context.getLinkReferenceDefinition(label); + if (definition != null) { + dest = definition.getDestination(); + title = definition.getTitle(); isLinkOrImage = true; } } @@ -618,83 +546,58 @@ private void removeLastBracket() { * Attempt to parse link destination, returning the string or null if no match. 
*/ private String parseLinkDestination() { - String res = match(LINK_DESTINATION_BRACES); - if (res != null) { // chop off surrounding <..>: - if (res.length() == 2) { - return ""; - } else { - return Escaping.unescapeString(res.substring(1, res.length() - 1)); - } - } else { - int startIndex = index; - parseLinkDestinationWithBalancedParens(); - return Escaping.unescapeString(input.substring(startIndex, index)); + int afterDest = LinkScanner.scanLinkDestination(input, index); + if (afterDest == -1) { + return null; } - } - private void parseLinkDestinationWithBalancedParens() { - int parens = 0; - while (true) { - char c = peek(); - switch (c) { - case '\0': - return; - case '\\': - // check if we have an escapable character - if (index + 1 < input.length() && ESCAPABLE.matcher(input.substring(index + 1, index + 2)).matches()) { - // skip over the escaped character (after switch) - index++; - break; - } - // otherwise, we treat this as a literal backslash - break; - case '(': - parens++; - break; - case ')': - if (parens == 0) { - return; - } else { - parens--; - } - break; - case ' ': - // ASCII space - return; - default: - // or control character - if (Character.isISOControl(c)) { - return; - } - } - index++; + String dest; + if (peek() == '<') { + // chop off surrounding <..>: + dest = input.substring(index + 1, afterDest - 1); + } else { + dest = input.substring(index, afterDest); } + + index = afterDest; + return Escaping.unescapeString(dest); } /** * Attempt to parse link title (sans quotes), returning the string or null if no match. 
*/ private String parseLinkTitle() { - String title = match(LINK_TITLE); - if (title != null) { - // chop off quotes from title and unescape: - return Escaping.unescapeString(title.substring(1, title.length() - 1)); - } else { + int afterTitle = LinkScanner.scanLinkTitle(input, index); + if (afterTitle == -1) { return null; } + + // chop off ', " or parens + String title = input.substring(index + 1, afterTitle - 1); + index = afterTitle; + return Escaping.unescapeString(title); } /** * Attempt to parse a link label, returning number of characters parsed. */ - private int parseLinkLabel() { - String m = match(LINK_LABEL); - // Spec says "A link label can have at most 999 characters inside the square brackets" - if (m == null || m.length() > 1001) { + int parseLinkLabel() { + if (index >= input.length() || input.charAt(index) != '[') { + return 0; + } + + int startContent = index + 1; + int endContent = LinkScanner.scanLinkLabelContent(input, startContent); + // spec: A link label can have at most 999 characters inside the square brackets. 
+ int contentLength = endContent - startContent; + if (endContent == -1 || contentLength > 999) { + return 0; + } + if (endContent >= input.length() || input.charAt(endContent) != ']') { return 0; - } else { - return m.length(); } + index = endContent + 1; + return contentLength + 2; } /** diff --git a/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java new file mode 100644 index 000000000..1fe2cbea7 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/LinkReferenceDefinitionParser.java @@ -0,0 +1,262 @@ +package org.commonmark.internal; + +import org.commonmark.internal.util.Escaping; +import org.commonmark.internal.util.LinkScanner; +import org.commonmark.internal.util.Parsing; +import org.commonmark.node.LinkReferenceDefinition; + +import java.util.ArrayList; +import java.util.List; + +/** + * Parser for link reference definitions at the beginning of a paragraph. + * + * @see Link reference definitions + */ +public class LinkReferenceDefinitionParser { + + private State state = State.START_DEFINITION; + + private final StringBuilder paragraph = new StringBuilder(); + private final List definitions = new ArrayList<>(); + + private StringBuilder label; + private String normalizedLabel; + private String destination; + private char titleDelimiter; + private StringBuilder title; + private boolean referenceValid = false; + + public void parse(CharSequence line) { + if (paragraph.length() != 0) { + paragraph.append('\n'); + } + paragraph.append(line); + + int i = 0; + while (i < line.length()) { + switch (state) { + case PARAGRAPH: { + // We're in a paragraph now. Link reference definitions can only appear at the beginning, so once + // we're in a paragraph, there's no going back. 
+ return; + } + case START_DEFINITION: { + i = startDefinition(line, i); + break; + } + case LABEL: { + i = label(line, i); + break; + } + case DESTINATION: { + i = destination(line, i); + break; + } + case START_TITLE: { + i = startTitle(line, i); + break; + } + case TITLE: { + i = title(line, i); + break; + } + } + // -1 is returned if parsing failed, which means we fall back to treating text as a paragraph. + if (i == -1) { + state = State.PARAGRAPH; + return; + } + } + } + + CharSequence getParagraphContent() { + return paragraph; + } + + List getDefinitions() { + finishReference(); + return definitions; + } + + State getState() { + return state; + } + + private int startDefinition(CharSequence line, int i) { + i = Parsing.skipSpaceTab(line, i, line.length()); + if (i >= line.length() || line.charAt(i) != '[') { + return -1; + } + + state = State.LABEL; + label = new StringBuilder(); + + int labelStart = i + 1; + if (labelStart >= line.length()) { + label.append('\n'); + } + + return labelStart; + } + + private int label(CharSequence line, int i) { + int afterLabel = LinkScanner.scanLinkLabelContent(line, i); + if (afterLabel == -1) { + return -1; + } + + label.append(line, i, afterLabel); + + if (afterLabel >= line.length()) { + // label might continue on next line + label.append('\n'); + return afterLabel; + } else if (line.charAt(afterLabel) == ']') { + int colon = afterLabel + 1; + // end of label + if (colon >= line.length() || line.charAt(colon) != ':') { + return -1; + } + + // spec: A link label can have at most 999 characters inside the square brackets. 
+ if (label.length() > 999) { + return -1; + } + + String normalizedLabel = Escaping.normalizeLabelContent(label.toString()); + if (normalizedLabel.isEmpty()) { + return -1; + } + + this.normalizedLabel = normalizedLabel; + state = State.DESTINATION; + + return Parsing.skipSpaceTab(line, colon + 1, line.length()); + } else { + return -1; + } + } + + private int destination(CharSequence line, int i) { + i = Parsing.skipSpaceTab(line, i, line.length()); + int afterDestination = LinkScanner.scanLinkDestination(line, i); + if (afterDestination == -1) { + return -1; + } + + destination = (line.charAt(i) == '<') + ? line.subSequence(i + 1, afterDestination - 1).toString() + : line.subSequence(i, afterDestination).toString(); + + int afterSpace = Parsing.skipSpaceTab(line, afterDestination, line.length()); + if (afterSpace >= line.length()) { + // Destination was at end of line, so this is a valid reference for sure (and maybe a title). + // If not at end of line, wait for title to be valid first. + referenceValid = true; + paragraph.setLength(0); + } else if (afterSpace == afterDestination) { + // spec: The title must be separated from the link destination by whitespace + return -1; + } + + state = State.START_TITLE; + return afterSpace; + } + + private int startTitle(CharSequence line, int i) { + i = Parsing.skipSpaceTab(line, i, line.length()); + if (i >= line.length()) { + state = State.START_DEFINITION; + return i; + } + + titleDelimiter = '\0'; + char c = line.charAt(i); + switch (c) { + case '"': + case '\'': + titleDelimiter = c; + break; + case '(': + titleDelimiter = ')'; + break; + } + + if (titleDelimiter != '\0') { + state = State.TITLE; + title = new StringBuilder(); + i++; + if (i == line.length()) { + title.append('\n'); + } + } else { + finishReference(); + // There might be another reference instead, try that for the same character. 
+ state = State.START_DEFINITION; + } + return i; + } + + private int title(CharSequence line, int i) { + int afterTitle = LinkScanner.scanLinkTitleContent(line, i, titleDelimiter); + if (afterTitle == -1) { + // Invalid title, stop + return -1; + } + + title.append(line.subSequence(i, afterTitle)); + + if (afterTitle >= line.length()) { + // Title still going, continue on next line + title.append('\n'); + return afterTitle; + } + + int afterTitleDelimiter = afterTitle + 1; + int afterSpace = Parsing.skipSpaceTab(line, afterTitleDelimiter, line.length()); + if (afterSpace != line.length()) { + // spec: No further non-whitespace characters may occur on the line. + return -1; + } + referenceValid = true; + finishReference(); + paragraph.setLength(0); + + // See if there's another definition. + state = State.START_DEFINITION; + return afterSpace; + } + + private void finishReference() { + if (!referenceValid) { + return; + } + + String d = Escaping.unescapeString(destination); + String t = title != null ? Escaping.unescapeString(title.toString()) : null; + definitions.add(new LinkReferenceDefinition(normalizedLabel, d, t)); + + label = null; + referenceValid = false; + normalizedLabel = null; + destination = null; + title = null; + } + + enum State { + // Looking for the start of a definition, i.e. `[` + START_DEFINITION, + // Parsing the label, i.e. `foo` within `[foo]` + LABEL, + // Parsing the destination, i.e. `/url` in `[foo]: /url` + DESTINATION, + // Looking for the start of a title, i.e. the first `"` in `[foo]: /url "title"` + START_TITLE, + // Parsing the content of the title, i.e. 
`title` in `[foo]: /url "title"` + TITLE, + + // End state, no matter what kind of lines we add, they won't be references + PARAGRAPH, + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java index 28f9bfb0f..de1558f92 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ListBlockParser.java @@ -205,7 +205,7 @@ public static class Factory extends AbstractBlockParserFactory { public BlockStart tryStart(ParserState state, MatchedBlockParser matchedBlockParser) { BlockParser matched = matchedBlockParser.getMatchedBlockParser(); - if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT && !(matched instanceof ListBlockParser)) { + if (state.getIndent() >= Parsing.CODE_BLOCK_INDENT) { return BlockStart.none(); } int markerIndex = state.getNextNonSpaceIndex(); diff --git a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java index fac8cfadc..fc44cfd57 100644 --- a/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java +++ b/commonmark/src/main/java/org/commonmark/internal/ParagraphParser.java @@ -1,17 +1,19 @@ package org.commonmark.internal; -import org.commonmark.internal.util.Parsing; import org.commonmark.node.Block; +import org.commonmark.node.LinkReferenceDefinition; import org.commonmark.node.Paragraph; +import org.commonmark.parser.InlineParser; import org.commonmark.parser.block.AbstractBlockParser; import org.commonmark.parser.block.BlockContinue; -import org.commonmark.parser.InlineParser; import org.commonmark.parser.block.ParserState; +import java.util.List; + public class ParagraphParser extends AbstractBlockParser { private final Paragraph block = new Paragraph(); - private BlockContent content = new BlockContent(); + private LinkReferenceDefinitionParser 
linkReferenceDefinitionParser = new LinkReferenceDefinitionParser(); @Override public Block getBlock() { @@ -29,40 +31,29 @@ public BlockContinue tryContinue(ParserState state) { @Override public void addLine(CharSequence line) { - content.add(line); + linkReferenceDefinitionParser.parse(line); } @Override public void closeBlock() { - } - - public void closeBlock(ReferenceParser inlineParser) { - String contentString = content.getString(); - boolean hasReferenceDefs = false; - - int pos; - // try parsing the beginning as link reference definitions: - while (contentString.length() > 3 && contentString.charAt(0) == '[' && - (pos = inlineParser.parseReference(contentString)) != 0) { - contentString = contentString.substring(pos); - hasReferenceDefs = true; - } - if (hasReferenceDefs && Parsing.isBlank(contentString)) { + if (linkReferenceDefinitionParser.getParagraphContent().length() == 0) { block.unlink(); - content = null; - } else { - content = new BlockContent(contentString); } } @Override public void parseInlines(InlineParser inlineParser) { - if (content != null) { - inlineParser.parse(content.getString(), block); + CharSequence content = linkReferenceDefinitionParser.getParagraphContent(); + if (content.length() > 0) { + inlineParser.parse(content.toString(), block); } } - public String getContentString() { - return content.getString(); + public CharSequence getContentString() { + return linkReferenceDefinitionParser.getParagraphContent(); + } + + public List getDefinitions() { + return linkReferenceDefinitionParser.getDefinitions(); } } diff --git a/commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java b/commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java deleted file mode 100644 index 35f36cb59..000000000 --- a/commonmark/src/main/java/org/commonmark/internal/ReferenceParser.java +++ /dev/null @@ -1,11 +0,0 @@ -package org.commonmark.internal; - -/** - * Parser for inline references - */ -public interface ReferenceParser { - 
/** - * @return how many characters were parsed as a reference, {@code 0} if none - */ - int parseReference(String s); -} diff --git a/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java b/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java index 60f192bef..98b43938c 100644 --- a/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java +++ b/commonmark/src/main/java/org/commonmark/internal/inline/EmphasisDelimiterProcessor.java @@ -33,7 +33,9 @@ public int getMinLength() { @Override public int getDelimiterUse(DelimiterRun opener, DelimiterRun closer) { // "multiple of 3" rule for internal delimiter runs - if ((opener.canClose() || closer.canOpen()) && (opener.originalLength() + closer.originalLength()) % 3 == 0) { + if ((opener.canClose() || closer.canOpen()) && + closer.originalLength() % 3 != 0 && + (opener.originalLength() + closer.originalLength()) % 3 == 0) { return 0; } // calculate actual number of delimiters used from this closer diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java index 9136b56f8..15197556c 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Escaping.java @@ -9,20 +9,13 @@ public class Escaping { public static final String ESCAPABLE = "[!\"#$%&\'()*+,./:;<=>?@\\[\\\\\\]^_`{|}~-]"; - private static final String ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; + public static final String ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"; private static final Pattern BACKSLASH_OR_AMP = Pattern.compile("[\\\\&]"); private static final Pattern ENTITY_OR_ESCAPED_CHAR = Pattern.compile("\\\\" + ESCAPABLE + '|' + ENTITY, Pattern.CASE_INSENSITIVE); - private static final String XML_SPECIAL = "[&<>\"]"; - - private static final 
Pattern XML_SPECIAL_RE = Pattern.compile(XML_SPECIAL); - - private static final Pattern XML_SPECIAL_OR_ENTITY = - Pattern.compile(ENTITY + '|' + XML_SPECIAL, Pattern.CASE_INSENSITIVE); - // From RFC 3986 (see "reserved", "unreserved") except don't escape '[' or ']' to be compatible with JS encodeURI private static final Pattern ESCAPE_IN_URI = Pattern.compile("(%[a-fA-F0-9]{0,2}|[^:/?#@!$&'()*+,;=a-zA-Z0-9\\-._~])"); @@ -32,28 +25,6 @@ public class Escaping { private static final Pattern WHITESPACE = Pattern.compile("[ \t\r\n]+"); - private static final Replacer UNSAFE_CHAR_REPLACER = new Replacer() { - @Override - public void replace(String input, StringBuilder sb) { - switch (input) { - case "&": - sb.append("&"); - break; - case "<": - sb.append("<"); - break; - case ">": - sb.append(">"); - break; - case "\"": - sb.append("""); - break; - default: - sb.append(input); - } - } - }; - private static final Replacer UNESCAPE_REPLACER = new Replacer() { @Override public void replace(String input, StringBuilder sb) { @@ -88,9 +59,41 @@ public void replace(String input, StringBuilder sb) { } }; - public static String escapeHtml(String input, boolean preserveEntities) { - Pattern p = preserveEntities ? XML_SPECIAL_OR_ENTITY : XML_SPECIAL_RE; - return replaceAll(p, input, UNSAFE_CHAR_REPLACER); + public static String escapeHtml(String input) { + // Avoid building a new string in the majority of cases (nothing to escape) + StringBuilder sb = null; + + loop: + for (int i = 0; i < input.length(); i++) { + char c = input.charAt(i); + String replacement; + switch (c) { + case '&': + replacement = "&"; + break; + case '<': + replacement = "<"; + break; + case '>': + replacement = ">"; + break; + case '\"': + replacement = """; + break; + default: + if (sb != null) { + sb.append(c); + } + continue loop; + } + if (sb == null) { + sb = new StringBuilder(); + sb.append(input, 0, i); + } + sb.append(replacement); + } + + return sb != null ? 
sb.toString() : input; } /** @@ -109,9 +112,14 @@ public static String percentEncodeUrl(String s) { } public static String normalizeReference(String input) { - // Strip '[' and ']', then trim - String stripped = input.substring(1, input.length() - 1).trim(); - String lowercase = stripped.toLowerCase(Locale.ROOT); + // Strip '[' and ']' + String stripped = input.substring(1, input.length() - 1); + return normalizeLabelContent(stripped); + } + + public static String normalizeLabelContent(String input) { + String trimmed = input.trim(); + String lowercase = trimmed.toLowerCase(Locale.ROOT); return WHITESPACE.matcher(lowercase).replaceAll(" "); } diff --git a/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java new file mode 100644 index 000000000..f25cd59e5 --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/internal/util/LinkScanner.java @@ -0,0 +1,148 @@ +package org.commonmark.internal.util; + +public class LinkScanner { + + /** + * Attempt to scan the contents of a link label (inside the brackets), returning the position after the content or + * -1. The returned position can either be the closing {@code ]}, or the end of the line if the label continues on + * the next line. + */ + public static int scanLinkLabelContent(CharSequence input, int start) { + for (int i = start; i < input.length(); i++) { + char c = input.charAt(i); + switch (c) { + case '\\': + if (Parsing.isEscapable(input, i + 1)) { + i += 1; + } + break; + case ']': + return i; + case '[': + // spec: Unescaped square bracket characters are not allowed inside the opening and closing + // square brackets of link labels. + return -1; + } + } + return input.length(); + } + + /** + * Attempt to scan a link destination, returning the position after the destination or -1. 
+ */ + public static int scanLinkDestination(CharSequence input, int start) { + if (start >= input.length()) { + return -1; + } + + if (input.charAt(start) == '<') { + for (int i = start + 1; i < input.length(); i++) { + char c = input.charAt(i); + switch (c) { + case '\\': + if (Parsing.isEscapable(input, i + 1)) { + i += 1; + } + break; + case '\n': + case '<': + return -1; + case '>': + return i + 1; + } + } + return -1; + } else { + return scanLinkDestinationWithBalancedParens(input, start); + } + } + + public static int scanLinkTitle(CharSequence input, int start) { + if (start >= input.length()) { + return -1; + } + + char endDelimiter; + switch (input.charAt(start)) { + case '"': + endDelimiter = '"'; + break; + case '\'': + endDelimiter = '\''; + break; + case '(': + endDelimiter = ')'; + break; + default: + return -1; + } + + int afterContent = scanLinkTitleContent(input, start + 1, endDelimiter); + if (afterContent == -1) { + return -1; + } + + if (afterContent >= input.length() || input.charAt(afterContent) != endDelimiter) { + // missing or wrong end delimiter + return -1; + } + + return afterContent + 1; + } + + public static int scanLinkTitleContent(CharSequence input, int start, char endDelimiter) { + for (int i = start; i < input.length(); i++) { + char c = input.charAt(i); + if (c == '\\' && Parsing.isEscapable(input, i + 1)) { + i += 1; + } else if (c == endDelimiter) { + return i; + } else if (endDelimiter == ')' && c == '(') { + // unescaped '(' in title within parens is invalid + return -1; + } + } + return input.length(); + } + + // spec: a nonempty sequence of characters that does not start with <, does not include ASCII space or control + // characters, and includes parentheses only if (a) they are backslash-escaped or (b) they are part of a balanced + // pair of unescaped parentheses + private static int scanLinkDestinationWithBalancedParens(CharSequence input, int start) { + int parens = 0; + for (int i = start; i < input.length(); i++) { 
+ char c = input.charAt(i); + switch (c) { + case '\0': + case ' ': + return i != start ? i : -1; + case '\\': + if (Parsing.isEscapable(input, i + 1)) { + i += 1; + } + break; + case '(': + parens++; + // Limit to 32 nested parens for pathological cases + if (parens > 32) { + return -1; + } + break; + case ')': + if (parens == 0) { + return i; + } else { + parens--; + } + break; + default: + // or control character + if (Character.isISOControl(c)) { + return i != start ? i : -1; + } + break; + } + } + return input.length(); + } +} diff --git a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java index f5cc888ee..d429d9db0 100644 --- a/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java +++ b/commonmark/src/main/java/org/commonmark/internal/util/Parsing.java @@ -50,6 +50,12 @@ public static boolean isBlank(CharSequence s) { return findNonSpace(s, 0) == -1; } + public static boolean hasNonSpace(CharSequence s) { + int length = s.length(); + int skipped = skip(' ', s, 0, length); + return skipped != length; + } + public static boolean isLetter(CharSequence s, int index) { int codePoint = Character.codePointAt(s, index); return Character.isLetter(codePoint); @@ -66,6 +72,47 @@ public static boolean isSpaceOrTab(CharSequence s, int index) { return false; } + public static boolean isEscapable(CharSequence s, int index) { + if (index < s.length()) { + switch (s.charAt(index)) { + case '!': + case '"': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '-': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '[': + case '\\': + case ']': + case '^': + case '_': + case '`': + case '{': + case '|': + case '}': + case '~': + return true; + } + } + return false; + } + /** * Prepares the input line replacing {@code \0} */ diff --git 
a/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java b/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java index 381c72b66..7edd635d7 100644 --- a/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java +++ b/commonmark/src/main/java/org/commonmark/node/AbstractVisitor.java @@ -108,6 +108,11 @@ public void visit(Text text) { visitChildren(text); } + @Override + public void visit(LinkReferenceDefinition linkReferenceDefinition) { + visitChildren(linkReferenceDefinition); + } + @Override public void visit(CustomBlock customBlock) { visitChildren(customBlock); diff --git a/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java b/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java new file mode 100644 index 000000000..a4578e99b --- /dev/null +++ b/commonmark/src/main/java/org/commonmark/node/LinkReferenceDefinition.java @@ -0,0 +1,60 @@ +package org.commonmark.node; + +// TODO: We're currently not adding these to the document. +// But that would be very useful for being able to render Nodes back to Markdown, see #98. + +/** + * A link reference definition, e.g.: + *

+ * [foo]: /url "title"
+ * 
+ *

+ * They can be referenced anywhere else in the document to produce a link using [foo]. The definitions + * themselves are usually not rendered in the final output. + * + * @see Link reference definitions + */ +public class LinkReferenceDefinition extends Node { + + private String label; + private String destination; + private String title; + + public LinkReferenceDefinition() { + } + + public LinkReferenceDefinition(String label, String destination, String title) { + this.label = label; + this.destination = destination; + this.title = title; + } + + public String getLabel() { + return label; + } + + public void setLabel(String label) { + this.label = label; + } + + public String getDestination() { + return destination; + } + + public void setDestination(String destination) { + this.destination = destination; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + @Override + public void accept(Visitor visitor) { + visitor.visit(this); + } +} diff --git a/commonmark/src/main/java/org/commonmark/node/Visitor.java b/commonmark/src/main/java/org/commonmark/node/Visitor.java index 8851b7b18..a155296f0 100644 --- a/commonmark/src/main/java/org/commonmark/node/Visitor.java +++ b/commonmark/src/main/java/org/commonmark/node/Visitor.java @@ -3,7 +3,7 @@ /** * Node visitor. *

- * See {@link AbstractVisitor} for a base class that can be extended. + * Implementations should subclass {@link AbstractVisitor} instead of implementing this directly. */ public interface Visitor { @@ -47,6 +47,8 @@ public interface Visitor { void visit(Text text); + void visit(LinkReferenceDefinition linkReferenceDefinition); + void visit(CustomBlock customBlock); void visit(CustomNode customNode); diff --git a/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java b/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java index 7a3be522d..467742e2c 100644 --- a/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java +++ b/commonmark/src/main/java/org/commonmark/parser/InlineParserContext.java @@ -1,13 +1,25 @@ package org.commonmark.parser; +import org.commonmark.node.LinkReferenceDefinition; import org.commonmark.parser.delimiter.DelimiterProcessor; import java.util.List; -import java.util.Map; /** - * Parameter context for custom inline parser. + * Context for inline parsing. */ public interface InlineParserContext { + + /** + * @return custom delimiter processors that have been configured with {@link Parser.Builder#customDelimiterProcessor(DelimiterProcessor)} + */ List getCustomDelimiterProcessors(); + + /** + * Look up a {@link LinkReferenceDefinition} for a given label. 
+ * + * @param label the link label to look up + * @return the definition if one exists, {@code null} otherwise + */ + LinkReferenceDefinition getLinkReferenceDefinition(String label); } diff --git a/commonmark/src/main/java/org/commonmark/parser/Parser.java b/commonmark/src/main/java/org/commonmark/parser/Parser.java index 04d28065f..5e15158ad 100644 --- a/commonmark/src/main/java/org/commonmark/parser/Parser.java +++ b/commonmark/src/main/java/org/commonmark/parser/Parser.java @@ -2,6 +2,7 @@ import org.commonmark.Extension; import org.commonmark.internal.DocumentParser; +import org.commonmark.internal.InlineParserContextImpl; import org.commonmark.internal.InlineParserImpl; import org.commonmark.node.*; import org.commonmark.parser.block.BlockParserFactory; @@ -10,6 +11,7 @@ import java.io.IOException; import java.io.Reader; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Set; @@ -32,12 +34,14 @@ public class Parser { private Parser(Builder builder) { this.blockParserFactories = DocumentParser.calculateBlockParserFactories(builder.blockParserFactories, builder.enabledBlockTypes); - this.inlineParserFactory = builder.inlineParserFactory; + this.inlineParserFactory = builder.getInlineParserFactory(); this.postProcessors = builder.postProcessors; this.delimiterProcessors = builder.delimiterProcessors; - // Try to construct an inline parser. This might raise exceptions in case of invalid configuration. - getInlineParser(); + // Try to construct an inline parser. Invalid configuration might result in an exception, which we want to + // detect as soon as possible. 
+ this.inlineParserFactory.create(new InlineParserContextImpl(delimiterProcessors, + Collections.emptyMap())); } /** @@ -61,8 +65,7 @@ public Node parse(String input) { if (input == null) { throw new NullPointerException("input must not be null"); } - InlineParser inlineParser = getInlineParser(); - DocumentParser documentParser = new DocumentParser(blockParserFactories, inlineParser); + DocumentParser documentParser = createDocumentParser(); Node document = documentParser.parse(input); return postProcess(document); } @@ -89,19 +92,14 @@ public Node parseReader(Reader input) throws IOException { if (input == null) { throw new NullPointerException("input must not be null"); } - InlineParser inlineParser = getInlineParser(); - DocumentParser documentParser = new DocumentParser(blockParserFactories, inlineParser); + + DocumentParser documentParser = createDocumentParser(); Node document = documentParser.parse(input); return postProcess(document); } - private InlineParser getInlineParser() { - if (this.inlineParserFactory == null) { - return new InlineParserImpl(delimiterProcessors); - } else { - CustomInlineParserContext inlineParserContext = new CustomInlineParserContext(delimiterProcessors); - return this.inlineParserFactory.create(inlineParserContext); - } + private DocumentParser createDocumentParser() { + return new DocumentParser(blockParserFactories, inlineParserFactory, delimiterProcessors); } private Node postProcess(Node document) { @@ -111,20 +109,6 @@ private Node postProcess(Node document) { return document; } - private class CustomInlineParserContext implements InlineParserContext { - - private List delimiterProcessors; - - CustomInlineParserContext(List delimiterProcessors) { - this.delimiterProcessors = delimiterProcessors; - } - - @Override - public List getCustomDelimiterProcessors() { - return delimiterProcessors; - } - } - /** * Builder for configuring a {@link Parser}. 
*/ @@ -133,7 +117,7 @@ public static class Builder { private final List delimiterProcessors = new ArrayList<>(); private final List postProcessors = new ArrayList<>(); private Set> enabledBlockTypes = DocumentParser.getDefaultBlockParserTypes(); - private InlineParserFactory inlineParserFactory = null; + private InlineParserFactory inlineParserFactory; /** * @return the configured {@link Parser} @@ -261,6 +245,18 @@ public Builder inlineParserFactory(InlineParserFactory inlineParserFactory) { this.inlineParserFactory = inlineParserFactory; return this; } + + private InlineParserFactory getInlineParserFactory() { + if (inlineParserFactory != null) { + return inlineParserFactory; + } + return new InlineParserFactory() { + @Override + public InlineParser create(InlineParserContext inlineParserContext) { + return new InlineParserImpl(inlineParserContext); + } + }; + } } /** diff --git a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java index ec38e8f39..8c79eb8b4 100644 --- a/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java +++ b/commonmark/src/main/java/org/commonmark/renderer/html/HtmlWriter.java @@ -25,7 +25,7 @@ public void raw(String s) { } public void text(String text) { - append(Escaping.escapeHtml(text, false)); + append(Escaping.escapeHtml(text)); } public void tag(String name) { @@ -42,9 +42,9 @@ public void tag(String name, Map attrs, boolean voidElement) { if (attrs != null && !attrs.isEmpty()) { for (Map.Entry attrib : attrs.entrySet()) { append(" "); - append(Escaping.escapeHtml(attrib.getKey(), true)); + append(Escaping.escapeHtml(attrib.getKey())); append("=\""); - append(Escaping.escapeHtml(attrib.getValue(), true)); + append(Escaping.escapeHtml(attrib.getValue())); append("\""); } } diff --git a/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java 
b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java new file mode 100644 index 000000000..f0bdef492 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/LinkReferenceDefinitionParserTest.java @@ -0,0 +1,179 @@ +package org.commonmark.internal; + +import org.commonmark.internal.LinkReferenceDefinitionParser.State; +import org.commonmark.node.LinkReferenceDefinition; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class LinkReferenceDefinitionParserTest { + + private LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + + @Test + public void testStartLabel() { + parser.parse("["); + assertEquals(State.LABEL, parser.getState()); + assertEquals("[", parser.getParagraphContent().toString()); + } + + @Test + public void testStartNoLabel() { + // Not a label + assertParagraph("a"); + // Can not go back to parsing link reference definitions + parser.parse("a"); + parser.parse("["); + assertEquals(State.PARAGRAPH, parser.getState()); + assertEquals("a\n[", parser.getParagraphContent().toString()); + } + + @Test + public void testEmptyLabel() { + assertParagraph("[]: /"); + assertParagraph("[ ]: /"); + assertParagraph("[ \t\n\u000B\f\r ]: /"); + } + + @Test + public void testLabelColon() { + assertParagraph("[foo] : /"); + } + + @Test + public void testLabel() { + assertState("[foo]:", State.DESTINATION, "[foo]:"); + assertState("[ foo ]:", State.DESTINATION, "[ foo ]:"); + } + + @Test + public void testLabelInvalid() { + assertParagraph("[foo[]:"); + } + + @Test + public void testLabelMultiline() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[two"); + assertEquals(State.LABEL, parser.getState()); + parser.parse("lines]:"); + assertEquals(State.DESTINATION, parser.getState()); + parser.parse("/url"); + assertEquals(State.START_TITLE, parser.getState()); + assertDef(parser.getDefinitions().get(0), "two lines", 
"/url", null); + } + + @Test + public void testLabelStartsWithNewline() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("["); + assertEquals(State.LABEL, parser.getState()); + parser.parse("weird]:"); + assertEquals(State.DESTINATION, parser.getState()); + parser.parse("/url"); + assertEquals(State.START_TITLE, parser.getState()); + assertDef(parser.getDefinitions().get(0), "weird", "/url", null); + } + + @Test + public void testDestination() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url"); + assertEquals(State.START_TITLE, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + + parser.parse("[bar]: "); + assertDef(parser.getDefinitions().get(1), "bar", "/url2", null); + } + + @Test + public void testDestinationInvalid() { + assertParagraph("[foo]: "); + } + + @Test + public void testTitle() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url 'title'"); + assertEquals(State.START_DEFINITION, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", "title"); + } + + @Test + public void testTitleStartWhitespace() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url"); + assertEquals(State.START_TITLE, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + parser.parse(" "); + + assertEquals(State.START_DEFINITION, parser.getState()); + assertEquals(" ", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", null); + } + + @Test + public void 
testTitleMultiline() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url 'two"); + assertEquals(State.TITLE, parser.getState()); + assertEquals("[foo]: /url 'two", parser.getParagraphContent().toString()); + assertEquals(0, parser.getDefinitions().size()); + + parser.parse("lines"); + assertEquals(State.TITLE, parser.getState()); + assertEquals("[foo]: /url 'two\nlines", parser.getParagraphContent().toString()); + assertEquals(0, parser.getDefinitions().size()); + + parser.parse("'"); + assertEquals(State.START_DEFINITION, parser.getState()); + assertEquals("", parser.getParagraphContent().toString()); + + assertEquals(1, parser.getDefinitions().size()); + assertDef(parser.getDefinitions().get(0), "foo", "/url", "two\nlines\n"); + } + + @Test + public void testTitleMultiline2() { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse("[foo]: /url '"); + assertEquals(State.TITLE, parser.getState()); + parser.parse("title'"); + assertEquals(State.START_DEFINITION, parser.getState()); + + assertDef(parser.getDefinitions().get(0), "foo", "/url", "\ntitle"); + } + + @Test + public void testTitleInvalid() { + assertParagraph("[foo]: /url (invalid("); + assertParagraph("[foo]: 'title'"); + assertParagraph("[foo]: /url 'title' INVALID"); + } + + private static void assertParagraph(String input) { + assertState(input, State.PARAGRAPH, input); + } + + private static void assertState(String input, State state, String paragraphContent) { + LinkReferenceDefinitionParser parser = new LinkReferenceDefinitionParser(); + parser.parse(input); + assertEquals(state, parser.getState()); + assertEquals(paragraphContent, parser.getParagraphContent().toString()); + } + + private static void assertDef(LinkReferenceDefinition def, String label, String destination, String title) { + assertEquals(label, def.getLabel()); + assertEquals(destination, def.getDestination()); + assertEquals(title, 
def.getTitle()); + } +} diff --git a/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java b/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java new file mode 100644 index 000000000..9433eb7d0 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/internal/util/EscapingTest.java @@ -0,0 +1,21 @@ +package org.commonmark.internal.util; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class EscapingTest { + + @Test + public void testEscapeHtml() { + assertEquals("nothing to escape", Escaping.escapeHtml("nothing to escape")); + assertEquals("&", Escaping.escapeHtml("&")); + assertEquals("<", Escaping.escapeHtml("<")); + assertEquals(">", Escaping.escapeHtml(">")); + assertEquals(""", Escaping.escapeHtml("\"")); + assertEquals("< start", Escaping.escapeHtml("< start")); + assertEquals("end >", Escaping.escapeHtml("end >")); + assertEquals("< both >", Escaping.escapeHtml("< both >")); + assertEquals("< middle & too >", Escaping.escapeHtml("< middle & too >")); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java b/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java index e7348f5dd..948c484cd 100644 --- a/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java +++ b/commonmark/src/test/java/org/commonmark/test/DelimiterProcessorTest.java @@ -52,18 +52,18 @@ public void asymmetricDelimiter() { @Test public void multipleDelimitersWithDifferentLengths() { Parser parser = Parser.builder() - .customDelimiterProcessor(new OneTildeDelimiterProcessor()) - .customDelimiterProcessor(new TwoTildesDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) + .customDelimiterProcessor(new TwoDelimiterProcessor()) .build(); - assertEquals("

(1)one(/1) (2)two(/2)

\n", RENDERER.render(parser.parse("~one~ ~~two~~"))); - assertEquals("

(1)(2)both(/2)(/1)

\n", RENDERER.render(parser.parse("~~~both~~~"))); + assertEquals("

(1)one(/1) (2)two(/2)

\n", RENDERER.render(parser.parse("+one+ ++two++"))); + assertEquals("

(1)(2)both(/2)(/1)

\n", RENDERER.render(parser.parse("+++both+++"))); } @Test(expected = IllegalArgumentException.class) public void multipleDelimitersWithSameLength() { Parser.builder() - .customDelimiterProcessor(new OneTildeDelimiterProcessor()) - .customDelimiterProcessor(new OneTildeDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) + .customDelimiterProcessor(new OneDelimiterProcessor()) .build(); } @@ -180,16 +180,16 @@ public void render(Node node) { } } - private static class OneTildeDelimiterProcessor implements DelimiterProcessor { + private static class OneDelimiterProcessor implements DelimiterProcessor { @Override public char getOpeningCharacter() { - return '~'; + return '+'; } @Override public char getClosingCharacter() { - return '~'; + return '+'; } @Override @@ -209,16 +209,16 @@ public void process(Text opener, Text closer, int delimiterUse) { } } - private static class TwoTildesDelimiterProcessor implements DelimiterProcessor { + private static class TwoDelimiterProcessor implements DelimiterProcessor { @Override public char getOpeningCharacter() { - return '~'; + return '+'; } @Override public char getClosingCharacter() { - return '~'; + return '+'; } @Override diff --git a/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java b/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java index 1a64a6374..774c6ff0e 100644 --- a/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java +++ b/commonmark/src/test/java/org/commonmark/test/FencedCodeBlockParserTest.java @@ -26,14 +26,12 @@ public void backtickInfo() { public void backtickInfoDoesntAllowBacktick() { assertRendering("```info ` test\ncode\n```", "

```info ` test\ncode

\n
\n"); - // Note, it's unclear in the spec whether a ~~~ code block can contain ` in info or not, see: - // https://github.com/commonmark/CommonMark/issues/119 } @Test public void backtickAndTildeCantBeMixed() { assertRendering("``~`\ncode\n``~`", - "

~` code~`

\n"); + "

~` code ~`

\n"); } @Test diff --git a/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java b/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java index 6ccfe5465..30cbf24f3 100644 --- a/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java +++ b/commonmark/src/test/java/org/commonmark/test/HtmlRendererTest.java @@ -50,6 +50,15 @@ public void textEscaping() { assertEquals("

escaping: & < > " '

\n", rendered); } + @Test + public void attributeEscaping() { + Paragraph paragraph = new Paragraph(); + Link link = new Link(); + link.setDestination(":"); + paragraph.appendChild(link); + assertEquals("

\n", defaultRenderer().render(paragraph)); + } + @Test public void percentEncodeUrlDisabled() { assertEquals("

a

\n", defaultRenderer().render(parse("[a](foo&bar)"))); diff --git a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java index 8c5d57dd4..a853b1b11 100644 --- a/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java +++ b/commonmark/src/test/java/org/commonmark/test/PathologicalTest.java @@ -106,4 +106,18 @@ public void hugeHorizontalRule() { repeat("*", 10000) + "\n", "
\n"); } + + @Test + public void backslashInLink() { + // See https://github.com/commonmark/commonmark.js/issues/157 + assertRendering("[" + repeat("\\", x) + "\n", + "

" + "[" + repeat("\\", x / 2) + "

\n"); + } + + @Test + public void unclosedInlineLinks() { + // See https://github.com/commonmark/commonmark.js/issues/129 + assertRendering(repeat("[](", x) + "\n", + "

" + repeat("[](", x) + "

\n"); + } } diff --git a/commonmark/src/test/java/org/commonmark/test/RegressionTest.java b/commonmark/src/test/java/org/commonmark/test/RegressionTest.java index 5d49c2abd..c4a0d3be5 100644 --- a/commonmark/src/test/java/org/commonmark/test/RegressionTest.java +++ b/commonmark/src/test/java/org/commonmark/test/RegressionTest.java @@ -13,7 +13,9 @@ import java.net.URL; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; @RunWith(Parameterized.class) public class RegressionTest extends RenderingTestCase { @@ -22,6 +24,8 @@ public class RegressionTest extends RenderingTestCase { // The spec says URL-escaping is optional, but the examples assume that it's enabled. private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + private static final Map OVERRIDDEN_EXAMPLES = getOverriddenExamples(); + private final Example example; public RegressionTest(Example example) { @@ -42,11 +46,25 @@ public static List data() { @Test public void testHtmlRendering() { - assertRendering(example.getSource(), example.getHtml()); + String expectedHtml = OVERRIDDEN_EXAMPLES.get(example.getSource()); + if (expectedHtml == null) { + expectedHtml = example.getHtml(); + } + assertRendering(example.getSource(), expectedHtml); } @Override protected String render(String source) { return RENDERER.render(PARSER.parse(source)); } + + private static Map getOverriddenExamples() { + Map m = new HashMap<>(); + + // The only difference is that we don't change `%28` and `%29` to `(` and `)` (percent encoding is preserved) + m.put("[XSS](javascript&colon;alert%28'XSS'%29)\n", + "

XSS

\n"); + + return m; + } } diff --git a/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java b/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java new file mode 100644 index 000000000..6424ab659 --- /dev/null +++ b/commonmark/src/test/java/org/commonmark/test/SpecCrLfCoreTest.java @@ -0,0 +1,26 @@ +package org.commonmark.test; + +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.commonmark.testutil.SpecTestCase; +import org.commonmark.testutil.example.Example; + +/** + * Same as {@link SpecCoreTest} but converts line endings to Windows-style CR+LF endings before parsing. + */ +public class SpecCrLfCoreTest extends SpecTestCase { + + private static final Parser PARSER = Parser.builder().build(); + // The spec says URL-escaping is optional, but the examples assume that it's enabled. + private static final HtmlRenderer RENDERER = HtmlRenderer.builder().percentEncodeUrls(true).build(); + + public SpecCrLfCoreTest(Example example) { + super(example); + } + + @Override + protected String render(String source) { + String windowsStyle = source.replace("\n", "\r\n"); + return RENDERER.render(PARSER.parse(windowsStyle)); + } +} diff --git a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java index 1550d3197..c2bb9fd4c 100644 --- a/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java +++ b/commonmark/src/test/java/org/commonmark/test/SpecialInputTest.java @@ -114,12 +114,13 @@ public void linkDestinationEscaping() { assertRendering("[foo](\\))", "

foo

\n"); // ` ` is not escapable, so the backslash is a literal backslash and there's an optional space at the end assertRendering("[foo](\\ )", "

foo

\n"); - // Backslash escapes `>`, so it's not a `(<...>)` link, but a `(...)` link instead - assertRendering("[foo](<\\>)", "

foo

\n"); // Backslash is a literal, so valid assertRendering("[foo]()", "

foo

\n"); // Backslash escapes `>` but there's another `>`, valid assertRendering("[foo](>)", "

foo

\n"); + + // This is a tricky one. There's `<` so we try to parse it as a `<` link but fail. + assertRendering("[foo](<\\>)", "

[foo](<>)

\n"); } // commonmark/CommonMark#468 @@ -138,4 +139,27 @@ public void linkReferenceBackslash() { public void emphasisMultipleOf3Rule() { assertRendering("a***b* c*", "

a*b c

\n"); } + + @Test + public void deeplyIndentedList() { + assertRendering("* one\n" + + " * two\n" + + " * three\n" + + " * four", + "
    \n" + + "
  • one\n" + + "
      \n" + + "
    • two\n" + + "
        \n" + + "
      • three\n" + + "
          \n" + + "
        • four
        • \n" + + "
        \n" + + "
      • \n" + + "
      \n" + + "
    • \n" + + "
    \n" + + "
  • \n" + + "
\n"); + } }