Skip to content

Commit 3733963

Browse files
authored
Merge pull request #321 from commonmark/robinst-inline-content-parser
Support for extending inline parsing with custom inline content parsers
2 parents 1bf85c6 + 6b16c69 commit 3733963

23 files changed

+393
-118
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html
77
with the exception that 0.x versions can break between minor versions.
88

99
## Unreleased
10+
### Added
11+
- Support for extending inline parsing with custom inline content parsers! See
12+
`Parser.Builder#customInlineContentParserFactory`. This allows users or
13+
extensions to hook into inline parsing on a deeper level than using delimiter
14+
processors. It could be used to implement support for math/LaTeX formulas, for
15+
example.
1016
### Fixed
1117
- Fix parsing of a link reference definition that looks like it has a title
1218
but it doesn't because it's followed by characters other than space/tab. In that

README.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,19 @@ elements in the resulting HTML, you can create your own subclass of
221221
To define the HTML rendering for them, you can use a `NodeRenderer` as
222222
explained above.
223223

224+
#### Customize parsing
225+
226+
There are a few ways to extend parsing or even override built-in parsing,
227+
all of them via methods on `Parser.Builder`
228+
(see [Blocks and inlines](https://spec.commonmark.org/0.31.2/#blocks-and-inlines) in the spec for an overview of blocks/inlines):
229+
230+
- Parsing of specific block types (e.g. headings, code blocks) can be
231+
enabled/disabled with `enabledBlockTypes`
232+
- Parsing of blocks can be extended/overridden with `customBlockParserFactory`
233+
- Parsing of inline content can be extended/overridden with `customInlineContentParserFactory`
234+
- Parsing of [delimiters](https://spec.commonmark.org/0.31.2/#emphasis-and-strong-emphasis) in inline content can be
235+
extended with `customDelimiterProcessor`
236+
224237
#### Thread-safety
225238

226239
Both the `Parser` and `HtmlRenderer` are designed so that you can

commonmark/src/main/java/org/commonmark/internal/DocumentParser.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package org.commonmark.internal;
22

3+
import org.commonmark.parser.beta.InlineContentParserFactory;
34
import org.commonmark.internal.util.Parsing;
45
import org.commonmark.node.*;
56
import org.commonmark.parser.*;
@@ -66,6 +67,7 @@ public class DocumentParser implements ParserState {
6667

6768
private final List<BlockParserFactory> blockParserFactories;
6869
private final InlineParserFactory inlineParserFactory;
70+
private final List<InlineContentParserFactory> inlineContentParserFactories;
6971
private final List<DelimiterProcessor> delimiterProcessors;
7072
private final IncludeSourceSpans includeSourceSpans;
7173
private final DocumentBlockParser documentBlockParser;
@@ -75,9 +77,11 @@ public class DocumentParser implements ParserState {
7577
private final List<BlockParser> allBlockParsers = new ArrayList<>();
7678

7779
public DocumentParser(List<BlockParserFactory> blockParserFactories, InlineParserFactory inlineParserFactory,
78-
List<DelimiterProcessor> delimiterProcessors, IncludeSourceSpans includeSourceSpans) {
80+
List<InlineContentParserFactory> inlineContentParserFactories, List<DelimiterProcessor> delimiterProcessors,
81+
IncludeSourceSpans includeSourceSpans) {
7982
this.blockParserFactories = blockParserFactories;
8083
this.inlineParserFactory = inlineParserFactory;
84+
this.inlineContentParserFactories = inlineContentParserFactories;
8185
this.delimiterProcessors = delimiterProcessors;
8286
this.includeSourceSpans = includeSourceSpans;
8387

@@ -477,7 +481,7 @@ private void addDefinitionsFrom(ParagraphParser paragraphParser) {
477481
* Walk through a block & children recursively, parsing string content into inline content where appropriate.
478482
*/
479483
private void processInlines() {
480-
InlineParserContextImpl context = new InlineParserContextImpl(delimiterProcessors, definitions);
484+
InlineParserContextImpl context = new InlineParserContextImpl(inlineContentParserFactories, delimiterProcessors, definitions);
481485
InlineParser inlineParser = inlineParserFactory.create(context);
482486

483487
for (BlockParser blockParser : allBlockParsers) {

commonmark/src/main/java/org/commonmark/internal/InlineParserContextImpl.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,31 @@
11
package org.commonmark.internal;
22

3+
import org.commonmark.parser.beta.InlineContentParserFactory;
34
import org.commonmark.node.LinkReferenceDefinition;
45
import org.commonmark.parser.InlineParserContext;
56
import org.commonmark.parser.delimiter.DelimiterProcessor;
67

78
import java.util.List;
8-
import java.util.Map;
99

1010
public class InlineParserContextImpl implements InlineParserContext {
1111

12+
private final List<InlineContentParserFactory> inlineContentParserFactories;
1213
private final List<DelimiterProcessor> delimiterProcessors;
1314
private final LinkReferenceDefinitions linkReferenceDefinitions;
1415

15-
public InlineParserContextImpl(List<DelimiterProcessor> delimiterProcessors,
16+
public InlineParserContextImpl(List<InlineContentParserFactory> inlineContentParserFactories,
17+
List<DelimiterProcessor> delimiterProcessors,
1618
LinkReferenceDefinitions linkReferenceDefinitions) {
19+
this.inlineContentParserFactories = inlineContentParserFactories;
1720
this.delimiterProcessors = delimiterProcessors;
1821
this.linkReferenceDefinitions = linkReferenceDefinitions;
1922
}
2023

24+
@Override
25+
public List<InlineContentParserFactory> getCustomInlineContentParserFactories() {
26+
return inlineContentParserFactories;
27+
}
28+
2129
@Override
2230
public List<DelimiterProcessor> getCustomDelimiterProcessors() {
2331
return delimiterProcessors;

commonmark/src/main/java/org/commonmark/internal/InlineParserImpl.java

Lines changed: 69 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import org.commonmark.parser.InlineParser;
88
import org.commonmark.parser.InlineParserContext;
99
import org.commonmark.parser.SourceLines;
10-
import org.commonmark.parser.beta.Position;
10+
import org.commonmark.parser.beta.*;
1111
import org.commonmark.parser.beta.Scanner;
1212
import org.commonmark.parser.delimiter.DelimiterProcessor;
1313
import org.commonmark.text.Characters;
@@ -16,11 +16,12 @@
1616

1717
public class InlineParserImpl implements InlineParser, InlineParserState {
1818

19-
private final BitSet specialCharacters;
20-
private final Map<Character, DelimiterProcessor> delimiterProcessors;
2119
private final InlineParserContext context;
22-
private final Map<Character, List<InlineContentParser>> inlineParsers;
20+
private final List<InlineContentParserFactory> inlineContentParserFactories;
21+
private final Map<Character, DelimiterProcessor> delimiterProcessors;
22+
private final BitSet specialCharacters;
2323

24+
private Map<Character, List<InlineContentParser>> inlineParsers;
2425
private Scanner scanner;
2526
private boolean includeSourceSpans;
2627
private int trailingSpaces;
@@ -36,46 +37,31 @@ public class InlineParserImpl implements InlineParser, InlineParserState {
3637
*/
3738
private Bracket lastBracket;
3839

39-
public InlineParserImpl(InlineParserContext inlineParserContext) {
40-
this.delimiterProcessors = calculateDelimiterProcessors(inlineParserContext.getCustomDelimiterProcessors());
41-
42-
this.context = inlineParserContext;
43-
this.inlineParsers = new HashMap<>();
44-
this.inlineParsers.put('\\', Collections.<InlineContentParser>singletonList(new BackslashInlineParser()));
45-
this.inlineParsers.put('`', Collections.<InlineContentParser>singletonList(new BackticksInlineParser()));
46-
this.inlineParsers.put('&', Collections.<InlineContentParser>singletonList(new EntityInlineParser()));
47-
this.inlineParsers.put('<', Arrays.asList(new AutolinkInlineParser(), new HtmlInlineParser()));
48-
49-
this.specialCharacters = calculateSpecialCharacters(this.delimiterProcessors.keySet(), inlineParsers.keySet());
40+
public InlineParserImpl(InlineParserContext context) {
41+
this.context = context;
42+
this.inlineContentParserFactories = calculateInlineContentParserFactories(context.getCustomInlineContentParserFactories());
43+
this.delimiterProcessors = calculateDelimiterProcessors(context.getCustomDelimiterProcessors());
44+
this.specialCharacters = calculateSpecialCharacters(this.delimiterProcessors.keySet(), this.inlineContentParserFactories);
5045
}
5146

52-
public static BitSet calculateSpecialCharacters(Set<Character> delimiterCharacters, Set<Character> characters) {
53-
BitSet bitSet = new BitSet();
54-
for (Character c : delimiterCharacters) {
55-
bitSet.set(c);
56-
}
57-
for (Character c : characters) {
58-
bitSet.set(c);
59-
}
60-
bitSet.set('[');
61-
bitSet.set(']');
62-
bitSet.set('!');
63-
bitSet.set('\n');
64-
return bitSet;
47+
private List<InlineContentParserFactory> calculateInlineContentParserFactories(List<InlineContentParserFactory> customFactories) {
48+
// Custom parsers can override built-in parsers if they want, so make sure they are tried first
49+
var list = new ArrayList<>(customFactories);
50+
list.add(new BackslashInlineParser.Factory());
51+
list.add(new BackticksInlineParser.Factory());
52+
list.add(new EntityInlineParser.Factory());
53+
list.add(new AutolinkInlineParser.Factory());
54+
list.add(new HtmlInlineParser.Factory());
55+
return list;
6556
}
6657

67-
public static Map<Character, DelimiterProcessor> calculateDelimiterProcessors(List<DelimiterProcessor> delimiterProcessors) {
68-
Map<Character, DelimiterProcessor> map = new HashMap<>();
69-
addDelimiterProcessors(Arrays.<DelimiterProcessor>asList(new AsteriskDelimiterProcessor(), new UnderscoreDelimiterProcessor()), map);
58+
private static Map<Character, DelimiterProcessor> calculateDelimiterProcessors(List<DelimiterProcessor> delimiterProcessors) {
59+
var map = new HashMap<Character, DelimiterProcessor>();
60+
addDelimiterProcessors(List.of(new AsteriskDelimiterProcessor(), new UnderscoreDelimiterProcessor()), map);
7061
addDelimiterProcessors(delimiterProcessors, map);
7162
return map;
7263
}
7364

74-
@Override
75-
public Scanner scanner() {
76-
return scanner;
77-
}
78-
7965
private static void addDelimiterProcessors(Iterable<DelimiterProcessor> delimiterProcessors, Map<Character, DelimiterProcessor> map) {
8066
for (DelimiterProcessor delimiterProcessor : delimiterProcessors) {
8167
char opening = delimiterProcessor.getOpeningCharacter();
@@ -109,6 +95,40 @@ private static void addDelimiterProcessorForChar(char delimiterChar, DelimiterPr
10995
}
11096
}
11197

98+
private static BitSet calculateSpecialCharacters(Set<Character> delimiterCharacters,
99+
List<InlineContentParserFactory> inlineContentParserFactories) {
100+
BitSet bitSet = new BitSet();
101+
for (Character c : delimiterCharacters) {
102+
bitSet.set(c);
103+
}
104+
for (var factory : inlineContentParserFactories) {
105+
for (var c : factory.getTriggerCharacters()) {
106+
bitSet.set(c);
107+
}
108+
}
109+
bitSet.set('[');
110+
bitSet.set(']');
111+
bitSet.set('!');
112+
bitSet.set('\n');
113+
return bitSet;
114+
}
115+
116+
private Map<Character, List<InlineContentParser>> createInlineContentParsers() {
117+
var map = new HashMap<Character, List<InlineContentParser>>();
118+
for (var factory : inlineContentParserFactories) {
119+
var parser = factory.create();
120+
for (var c : factory.getTriggerCharacters()) {
121+
map.computeIfAbsent(c, k -> new ArrayList<>()).add(parser);
122+
}
123+
}
124+
return map;
125+
}
126+
127+
@Override
128+
public Scanner scanner() {
129+
return scanner;
130+
}
131+
112132
/**
113133
* Parse content in block into inline children, appending them to the block node.
114134
*/
@@ -117,14 +137,13 @@ public void parse(SourceLines lines, Node block) {
117137
reset(lines);
118138

119139
while (true) {
120-
List<? extends Node> nodes = parseInline();
121-
if (nodes != null) {
122-
for (Node node : nodes) {
123-
block.appendChild(node);
124-
}
125-
} else {
140+
var nodes = parseInline();
141+
if (nodes == null) {
126142
break;
127143
}
144+
for (Node node : nodes) {
145+
block.appendChild(node);
146+
}
128147
}
129148

130149
processDelimiters(null);
@@ -137,6 +156,7 @@ void reset(SourceLines lines) {
137156
this.trailingSpaces = 0;
138157
this.lastDelimiter = null;
139158
this.lastBracket = null;
159+
this.inlineParsers = createInlineContentParsers();
140160
}
141161

142162
private Text text(SourceLines sourceLines) {
@@ -155,20 +175,20 @@ private List<? extends Node> parseInline() {
155175

156176
switch (c) {
157177
case '[':
158-
return Collections.singletonList(parseOpenBracket());
178+
return List.of(parseOpenBracket());
159179
case '!':
160-
return Collections.singletonList(parseBang());
180+
return List.of(parseBang());
161181
case ']':
162-
return Collections.singletonList(parseCloseBracket());
182+
return List.of(parseCloseBracket());
163183
case '\n':
164-
return Collections.singletonList(parseLineBreak());
184+
return List.of(parseLineBreak());
165185
case Scanner.END:
166186
return null;
167187
}
168188

169189
// No inline parser, delimiter or other special handling.
170190
if (!specialCharacters.get(c)) {
171-
return Collections.singletonList(parseText());
191+
return List.of(parseText());
172192
}
173193

174194
List<InlineContentParser> inlineParsers = this.inlineParsers.get(c);
@@ -183,7 +203,7 @@ private List<? extends Node> parseInline() {
183203
if (includeSourceSpans && node.getSourceSpans().isEmpty()) {
184204
node.setSourceSpans(scanner.getSource(position, scanner.position()).getSourceSpans());
185205
}
186-
return Collections.singletonList(node);
206+
return List.of(node);
187207
} else {
188208
// Reset position
189209
scanner.setPosition(position);
@@ -200,7 +220,7 @@ private List<? extends Node> parseInline() {
200220
}
201221

202222
// If we get here, even for a special/delimiter character, we will just treat it as text.
203-
return Collections.singletonList(parseText());
223+
return List.of(parseText());
204224
}
205225

206226
/**

commonmark/src/main/java/org/commonmark/internal/inline/AutolinkInlineParser.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
import org.commonmark.node.Link;
44
import org.commonmark.node.Text;
55
import org.commonmark.parser.SourceLines;
6-
import org.commonmark.parser.beta.Position;
7-
import org.commonmark.parser.beta.Scanner;
6+
import org.commonmark.parser.beta.*;
87

8+
import java.util.Set;
99
import java.util.regex.Pattern;
1010

1111
/**
@@ -46,4 +46,16 @@ public ParsedInline tryParse(InlineParserState inlineParserState) {
4646
}
4747
return ParsedInline.none();
4848
}
49+
50+
public static class Factory implements InlineContentParserFactory {
51+
@Override
52+
public Set<Character> getTriggerCharacters() {
53+
return Set.of('<');
54+
}
55+
56+
@Override
57+
public InlineContentParser create() {
58+
return new AutolinkInlineParser();
59+
}
60+
}
4961
}

commonmark/src/main/java/org/commonmark/internal/inline/BackslashInlineParser.java

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
import org.commonmark.internal.util.Escaping;
44
import org.commonmark.node.HardLineBreak;
55
import org.commonmark.node.Text;
6-
import org.commonmark.parser.beta.Scanner;
6+
import org.commonmark.parser.beta.*;
77

8+
import java.util.Set;
89
import java.util.regex.Pattern;
910

1011
/**
@@ -32,4 +33,16 @@ public ParsedInline tryParse(InlineParserState inlineParserState) {
3233
return ParsedInline.of(new Text("\\"), scanner.position());
3334
}
3435
}
36+
37+
public static class Factory implements InlineContentParserFactory {
38+
@Override
39+
public Set<Character> getTriggerCharacters() {
40+
return Set.of('\\');
41+
}
42+
43+
@Override
44+
public InlineContentParser create() {
45+
return new BackslashInlineParser();
46+
}
47+
}
3548
}

commonmark/src/main/java/org/commonmark/internal/inline/BackticksInlineParser.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import org.commonmark.node.Code;
44
import org.commonmark.node.Text;
55
import org.commonmark.parser.SourceLines;
6-
import org.commonmark.parser.beta.Position;
7-
import org.commonmark.parser.beta.Scanner;
6+
import org.commonmark.parser.beta.*;
87
import org.commonmark.text.Characters;
98

9+
import java.util.Set;
10+
1011
/**
1112
* Attempt to parse backticks, returning either a backtick code span or a literal sequence of backticks.
1213
*/
@@ -47,4 +48,16 @@ public ParsedInline tryParse(InlineParserState inlineParserState) {
4748
Text text = new Text(source.getContent());
4849
return ParsedInline.of(text, afterOpening);
4950
}
51+
52+
public static class Factory implements InlineContentParserFactory {
53+
@Override
54+
public Set<Character> getTriggerCharacters() {
55+
return Set.of('`');
56+
}
57+
58+
@Override
59+
public InlineContentParser create() {
60+
return new BackticksInlineParser();
61+
}
62+
}
5063
}

0 commit comments

Comments
 (0)