Skip to content

Commit 64b82bf

Browse files
committed
Lexicals now with SEPARATOR handling.
1 parent 2c6d5a8 commit 64b82bf

File tree

2 files changed

+240
-0
lines changed

2 files changed

+240
-0
lines changed

pljava-api/src/main/java/org/postgresql/pljava/sqlgen/Lexicals.java

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import java.util.regex.Matcher;
1515
import java.util.regex.Pattern;
16+
import java.util.InputMismatchException;
1617

1718
import javax.annotation.processing.Messager;
1819
import javax.tools.Diagnostic.Kind;
@@ -196,6 +197,143 @@ public abstract class Lexicals
196197
"javaJavaIdentifier"
197198
));
198199

200+
/** A newline, in any of the various forms recognized by the Java regex
201+
* engine, letting it handle the details.
*<p>
* The expression switches on MULTILINE and DOTALL ({@code (?ms:...)}),
* anchors with {@code $} at an end-of-line position, and then possessively
* consumes one or two characters ({@code {1,2}+}); each consumed character
* is guarded by a lookbehind, either {@code (?<!^)} or {@code (?<=\G)}.
*<p>
* NOTE(review): the lookbehinds look intended to let a two-character
* terminator be taken as a unit while keeping the match from running on
* into a following line; confirm against the line-terminator rules
* documented for java.util.regex.Pattern before changing them.
202+
*/
203+
public static final Pattern NEWLINE = Pattern.compile(
204+
"(?ms:$(?:(?<!^).|(?<=\\G).){1,2}+)"
205+
);
206+
207+
/** White space <em>except</em> newline, for any Java-recognized newline.
*<p>
* Matches exactly one character. The lookahead {@code (?=\s)} requires that
* character to be white space, and because DOTALL is switched off
* ({@code (?-s:...)}) the {@code .} that consumes it cannot be a line
* terminator.
208+
*/
209+
public static final Pattern WHITESPACE_NO_NEWLINE = Pattern.compile(
210+
"(?-s:(?=\\s).)"
211+
);
212+
213+
/** The kind of comment that extends from -- to the end of the line.
214+
* This pattern does not eat the newline (though the ISO production does).
*<p>
* DOTALL is switched off ({@code (?-s:...)}) so the possessive {@code .*+}
* stops in front of the line terminator instead of running past it.
215+
*/
216+
public static final Pattern SIMPLE_COMMENT = Pattern.compile("(?-s:--.*+)");
217+
218+
/** Most of the inside of a bracketed comment, defined in an odd way.
219+
* It expects both characters of the /* introducer to have been consumed
220+
* already. This pattern will then eat the whole comment including both
221+
* closing characters <em>if</em> it encounters no nested comment;
222+
* otherwise it will consume everything including the / of the nested
223+
* introducer, but leaving the *, and the {@code <nest>} capturing group
224+
* will be present in the result. That signals the caller to increment the
225+
* nesting level, consume one * and invoke this pattern again. If the nested
226+
* match succeeds (without again setting the {@code <nest>} group), the
227+
* caller should then decrement the nest level and match this pattern again
228+
* to consume the rest of the comment at the original level.
229+
*<p>
230+
* This pattern leaves the * unconsumed upon finding a nested comment
231+
* introducer as a way to end the repetition in the SEPARATOR pattern, as
232+
* nothing the SEPARATOR pattern can match can begin with a *.
*<p>
* Structure of the expression: a possessive repetition first skips ordinary
* comment text (runs of characters other than * and /, a / that is not
* followed by *, or a * that is not followed by /). After that, exactly one
* of two things must follow: the two closing characters, or a / that a
* lookahead confirms is followed by *, which is captured as {@code <nest>}.
* If neither follows (for example, input ends first), the pattern does not
* match at all.
233+
*/
234+
public static final Pattern BRACKETED_COMMENT_INSIDE = Pattern.compile(
235+
"(?:(?:[^*/]++|/(?!\\*)|\\*(?!/))*+(?:\\*/|(?<nest>/(?=\\*))))"
236+
);
237+
238+
/** SQL's SEPARATOR, which can include any amount of whitespace, simple
239+
* comments, or bracketed comments. This pattern will consume as much of all
240+
* that as it can in one match. There are two capturing groups that might be
241+
* set in a match result: {@code <nl>} if there was at least one newline
242+
* matched among the whitespace (which needs to be known to get the
243+
* continuation of string literals right), and {@code <nest>} if the
244+
* start of a bracketed comment was encountered.
245+
*<p>
246+
* In the {@code <nest>} case, the / of the comment introducer will have
247+
* been consumed but the * will remain to consume (as described above
248+
* for BRACKETED_COMMENT_INSIDE); the caller will need to increment a nest
249+
* level, consume the *, and match BRACKETED_COMMENT_INSIDE to handle the
250+
* nesting comment. Assuming that completes without another {@code <nest>}
251+
* found, the level should be decremented and BRACKETED_COMMENT_INSIDE
252+
* matched again to match the rest of the outer comment. When that completes
253+
* (without a {@code <nest>}) at the outermost level, this pattern should be
254+
* matched again to mop up any remaining SEPARATOR content.
*<p>
* The expression is assembled with {@code String.format} from the source
* text of three other patterns: {@code %1$s} is WHITESPACE_NO_NEWLINE,
* {@code %2$s} is NEWLINE (the part captured as {@code <nl>}), and
* {@code %3$s} is SIMPLE_COMMENT. The fourth alternative, captured as
* {@code <nest>}, is a / that a lookahead confirms is followed by *.
255+
*/
256+
public static final Pattern SEPARATOR =
257+
Pattern.compile(String.format(
258+
"(?:(?:%1$s++|(?<nl>%2$s))++|%3$s|(?<nest>/(?=\\*)))++",
259+
WHITESPACE_NO_NEWLINE.pattern(),
260+
NEWLINE.pattern(),
261+
SIMPLE_COMMENT.pattern()
262+
));
263+
264+
/**
265+
* Consume any SQL SEPARATOR at the beginning of {@code Matcher}
266+
* <em>m</em>'s current region.
267+
*<p>
268+
* The region start is advanced to the character following any separator
269+
* (or not at all, if no separator is found).
270+
*<p>
271+
* The meaning of the return value is altered by the <em>significant</em>
272+
* parameter: when <em>significant</em> is true (meaning the very presence
273+
* or absence of a separator is significant at that point in the grammar),
274+
* the result will be true if any separator was found, false otherwise.
275+
* When <em>significant</em> is false, the result does not reveal whether
276+
* any separator was found, but will be true only if a separator was found
277+
* that includes at least one newline. That information is needed for the
278+
* grammar of string and binary-string literals.
279+
* @param m a {@code Matcher} whose current region should have any separator
280+
* at the beginning consumed. The region start is advanced past any
281+
* separator found. The {@code Pattern} associated with the {@code Matcher}
282+
* may be changed.
283+
* @param significant when true, the result should report whether any
284+
* separator was found or not; when false, the result should report only
285+
* whether a separator containing at least one newline was found, or not.
286+
* @return whether any separator was found, or whether any separator
287+
* containing a newline was found, as selected by <em>significant</em>.
288+
* @throws InputMismatchException if an unclosed /*-style comment is found.
289+
*/
290+
public static boolean separator(Matcher m, boolean significant)
291+
{
292+
int state = 0;
293+
int level = 0;
294+
boolean result = false;
295+
296+
loop:
297+
for ( ;; )
298+
{
299+
switch ( state )
300+
{
301+
case 0:
302+
m.usePattern(SEPARATOR);
303+
if ( ! m.lookingAt() )
304+
return result; // leave matcher region alone
305+
if ( significant || -1 != m.start("nl") )
306+
result = true;
307+
if ( -1 != m.start("nest") )
308+
{
309+
m.region(m.end(0) + 1, m.regionEnd()); // + 1 to eat the *
310+
m.usePattern(BRACKETED_COMMENT_INSIDE);
311+
++ level;
312+
state = 1;
313+
continue;
314+
}
315+
state = 2; // advance matcher region, then break loop
316+
break;
317+
case 1:
318+
if ( ! m.lookingAt() )
319+
throw new InputMismatchException("unclosed comment");
320+
if ( -1 != m.start("nest") )
321+
{
322+
m.region(m.end(0) + 1, m.regionEnd()); // + 1 to eat the *
323+
++ level;
324+
continue;
325+
}
326+
else if ( 0 == -- level )
327+
state = 0;
328+
break;
329+
case 2:
330+
break loop;
331+
}
332+
m.region(m.end(0), m.regionEnd()); // advance past matched portion
333+
}
334+
return result;
335+
}
336+
199337
/**
200338
* Return an Identifier.Simple, given a {@code Matcher} that has matched an
201339
* ISO_AND_PG_IDENTIFIER_CAPTURING. Will determine from the matching named

pljava-api/src/test/java/LexicalsTest.java

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
package org.postgresql.pljava;
1313

1414
import java.util.regex.Matcher;
15+
import java.util.regex.Pattern;
16+
17+
import java.util.InputMismatchException;
1518

1619
import junit.framework.TestCase;
1720

@@ -20,14 +23,113 @@
2023

2124
import static
2225
org.postgresql.pljava.sqlgen.Lexicals.ISO_AND_PG_IDENTIFIER_CAPTURING;
26+
import static
27+
org.postgresql.pljava.sqlgen.Lexicals.SEPARATOR;
2328
import static org.postgresql.pljava.sqlgen.Lexicals.identifierFrom;
29+
import static org.postgresql.pljava.sqlgen.Lexicals.separator;
2430

2531
import org.postgresql.pljava.sqlgen.Lexicals.Identifier;
2632

2733
public class LexicalsTest extends TestCase
2834
{
2935
public LexicalsTest(String name) { super(name); }
3036

37+
public void testSeparator() throws Exception
38+
{
39+
Pattern allTheRest = Pattern.compile(".*", Pattern.DOTALL);
40+
41+
Matcher m = SEPARATOR.matcher("no starting separator");
42+
assertFalse("separator 0", separator(m, true));
43+
m.usePattern(allTheRest).matches();
44+
assertEquals("no starting separator", m.group(0));
45+
46+
m.reset();
47+
assertFalse("separator 1", separator(m, false));
48+
m.usePattern(allTheRest).matches();
49+
assertEquals("no starting separator", m.group(0));
50+
51+
m.reset(" simple separator");
52+
assertTrue("separator 2", separator(m, true));
53+
m.usePattern(allTheRest).matches();
54+
assertEquals("simple separator", m.group(0));
55+
56+
m.reset();
57+
assertFalse("separator 3", separator(m, false));
58+
m.usePattern(allTheRest).matches();
59+
assertEquals("simple separator", m.group(0));
60+
61+
m.reset(" \n simple separator");
62+
assertTrue("separator 4", separator(m, true));
63+
m.usePattern(allTheRest).matches();
64+
assertEquals("simple separator", m.group(0));
65+
66+
m.reset();
67+
assertTrue("separator 5", separator(m, false));
68+
m.usePattern(allTheRest).matches();
69+
assertEquals("simple separator", m.group(0));
70+
71+
m.reset(" -- a simple comment\nsimple comment");
72+
assertTrue("separator 6", separator(m, true));
73+
m.usePattern(allTheRest).matches();
74+
assertEquals("simple comment", m.group(0));
75+
76+
m.reset();
77+
assertTrue("separator 7", separator(m, false));
78+
m.usePattern(allTheRest).matches();
79+
assertEquals("simple comment", m.group(0));
80+
81+
m.reset("/* a bracketed comment\n */ bracketed comment");
82+
assertTrue("separator 8", separator(m, true));
83+
m.usePattern(allTheRest).matches();
84+
assertEquals("bracketed comment", m.group(0));
85+
86+
m.reset();
87+
assertFalse("separator 9", separator(m, false));
88+
m.usePattern(allTheRest).matches();
89+
assertEquals("bracketed comment", m.group(0));
90+
91+
m.reset("/* a /* nested */ comment\n */ nested comment");
92+
assertTrue("separator 10", separator(m, true));
93+
m.usePattern(allTheRest).matches();
94+
assertEquals("nested comment", m.group(0));
95+
96+
m.reset();
97+
assertFalse("separator 11", separator(m, false));
98+
m.usePattern(allTheRest).matches();
99+
assertEquals("nested comment", m.group(0));
100+
101+
m.reset("/* an /* unclosed */ comment\n * / unclosed comment");
102+
try
103+
{
104+
separator(m, true);
105+
fail("unclosed comment not detected");
106+
}
107+
catch ( Exception ex )
108+
{
109+
assertTrue("separator 12", ex instanceof InputMismatchException);
110+
}
111+
112+
m.reset("/* -- tricky \n */ nested comment");
113+
assertTrue("separator 13", separator(m, true));
114+
m.usePattern(allTheRest).matches();
115+
assertEquals("nested comment", m.group(0));
116+
117+
m.reset();
118+
assertFalse("separator 14", separator(m, false));
119+
m.usePattern(allTheRest).matches();
120+
assertEquals("nested comment", m.group(0));
121+
122+
m.reset("-- /* tricky \n */ nested comment");
123+
assertTrue("separator 15", separator(m, true));
124+
m.usePattern(allTheRest).matches();
125+
assertEquals("*/ nested comment", m.group(0));
126+
127+
m.reset();
128+
assertTrue("separator 16", separator(m, false));
129+
m.usePattern(allTheRest).matches();
130+
assertEquals("*/ nested comment", m.group(0));
131+
}
132+
31133
public void testIdentifierFrom() throws Exception
32134
{
33135
Matcher m = ISO_AND_PG_IDENTIFIER_CAPTURING.matcher("anIdentifier");

0 commit comments

Comments
 (0)