|
13 | 13 |
|
14 | 14 | import java.util.regex.Matcher; |
15 | 15 | import java.util.regex.Pattern; |
| 16 | +import java.util.InputMismatchException; |
16 | 17 |
|
17 | 18 | import javax.annotation.processing.Messager; |
18 | 19 | import javax.tools.Diagnostic.Kind; |
@@ -196,6 +197,143 @@ public abstract class Lexicals |
196 | 197 | "javaJavaIdentifier" |
197 | 198 | )); |
198 | 199 |
|
| 200 | + /** A newline, in any of the various forms recognized by the Java regex |
| 201 | + * engine, letting it handle the details. |
| 202 | + */ |
| 203 | + public static final Pattern NEWLINE = Pattern.compile( |
| 204 | + "(?ms:$(?:(?<!^).|(?<=\\G).){1,2}+)" |
| 205 | + ); |
| 206 | + |
| 207 | + /** White space <em>except</em> newline, for any Java-recognized newline. |
| 208 | + */ |
| 209 | + public static final Pattern WHITESPACE_NO_NEWLINE = Pattern.compile( |
| 210 | + "(?-s:(?=\\s).)" |
| 211 | + ); |
| 212 | + |
| 213 | + /** The kind of comment that extends from -- to the end of the line. |
| 214 | + * This pattern does not eat the newline (though the ISO production does). |
| 215 | + */ |
| 216 | + public static final Pattern SIMPLE_COMMENT = Pattern.compile("(?-s:--.*+)"); |
| 217 | + |
| 218 | + /** Most of the inside of a bracketed comment, defined in an odd way. |
| 219 | + * It expects both characters of the /* introducer to have been consumed |
| 220 | + * already. This pattern will then eat the whole comment including both |
| 221 | + * closing characters <em>if</em> it encounters no nested comment; |
| 222 | + * otherwise it will consume everything including the / of the nested |
| 223 | + * introducer, but leaving the *, and the {@code <nest>} capturing group |
| 224 | + * will be present in the result. That signals the caller to increment the |
| 225 | + * nesting level, consume one * and invoke this pattern again. If the nested |
| 226 | + * match succeeds (without again setting the {@code <nest>} group), the |
| 227 | + * caller should then decrement the nest level and match this pattern again |
| 228 | + * to consume the rest of the comment at the original level. |
| 229 | + *<p> |
| 230 | + * This pattern leaves the * unconsumed upon finding a nested comment |
| 231 | + * introducer as a way to end the repetition in the SEPARATOR pattern, as |
| 232 | + * nothing the SEPARATOR pattern can match can begin with a *. |
| 233 | + */ |
| 234 | + public static final Pattern BRACKETED_COMMENT_INSIDE = Pattern.compile( |
| 235 | + "(?:(?:[^*/]++|/(?!\\*)|\\*(?!/))*+(?:\\*/|(?<nest>/(?=\\*))))" |
| 236 | + ); |
| 237 | + |
| 238 | + /** SQL's SEPARATOR, which can include any amount of whitespace, simple |
| 239 | + * comments, or bracketed comments. This pattern will consume as much of all |
| 240 | + * that as it can in one match. There are two capturing groups that might be |
| 241 | + * set in a match result: {@code <nl>} if there was at least one newline |
| 242 | + * matched among the whitespace (which needs to be known to get the |
| 243 | + * continuation of string literals right), and {@code <nest>} if the |
| 244 | + * start of a bracketed comment was encountered. |
| 245 | + *<p> |
| 246 | + * In the {@code <nest>} case, the / of the comment introducer will have |
| 247 | + * been consumed but the * will remain to consume (as described above |
| 248 | + * for BRACKETED_COMMENT_INSIDE); the caller will need to increment a nest |
| 249 | + * level, consume the *, and match BRACKETED_COMMENT_INSIDE to handle the |
| 250 | + * nesting comment. Assuming that completes without another {@code <nest>} |
| 251 | + * found, the level should be decremented and BRACKETED_COMMENT_INSIDE |
| 252 | + * matched again to match the rest of the outer comment. When that completes |
| 253 | + * (without a {@code <nest>}) at the outermost level, this pattern should be |
| 254 | + * matched again to mop up any remaining SEPARATOR content. |
| 255 | + */ |
| 256 | + public static final Pattern SEPARATOR = |
| 257 | + Pattern.compile(String.format( |
| 258 | + "(?:(?:%1$s++|(?<nl>%2$s))++|%3$s|(?<nest>/(?=\\*)))++", |
| 259 | + WHITESPACE_NO_NEWLINE.pattern(), |
| 260 | + NEWLINE.pattern(), |
| 261 | + SIMPLE_COMMENT.pattern() |
| 262 | + )); |
| 263 | + |
| 264 | + /** |
| 265 | + * Consume any SQL SEPARATOR at the beginning of {@code Matcher} |
| 266 | + * <em>m</em>'s current region. |
| 267 | + *<p> |
| 268 | + * The region start is advanced to the character following any separator |
| 269 | + * (or not at all, if no separator is found). |
| 270 | + *<p> |
| 271 | + * The meaning of the return value is altered by the <em>significant</em> |
| 272 | + * parameter: when <em>significant</em> is true (meaning the very presence |
| 273 | + * or absence of a separator is significant at that point in the grammar), |
| 274 | + * the result will be true if any separator was found, false otherwise. |
| 275 | + * When <em>significant</em> is false, the result does not reveal whether |
| 276 | + * any separator was found, but will be true only if a separator was found |
| 277 | + * that includes at least one newline. That information is needed for the |
| 278 | + * grammar of string and binary-string literals. |
| 279 | + * @param m a {@code Matcher} whose current region should have any separator |
| 280 | + * at the beginning consumed. The region start is advanced past any |
| 281 | + * separator found. The {@code Pattern} associated with the {@code Matcher} |
| 282 | + * may be changed. |
| 283 | + * @param significant when true, the result should report whether any |
| 284 | + * separator was found or not; when false, the result should report only |
| 285 | + * whether a separator containing at least one newline was found, or not. |
| 286 | + * @return whether any separator was found, or whether any separator |
| 287 | + * containing a newline was found, as selected by <em>significant</em>. |
| 288 | + * @throws InputMismatchException if an unclosed /*-style comment is found. |
| 289 | + */ |
| 290 | + public static boolean separator(Matcher m, boolean significant) |
| 291 | + { |
| 292 | + int state = 0; |
| 293 | + int level = 0; |
| 294 | + boolean result = false; |
| 295 | + |
| 296 | + loop: |
| 297 | + for ( ;; ) |
| 298 | + { |
| 299 | + switch ( state ) |
| 300 | + { |
| 301 | + case 0: |
| 302 | + m.usePattern(SEPARATOR); |
| 303 | + if ( ! m.lookingAt() ) |
| 304 | + return result; // leave matcher region alone |
| 305 | + if ( significant || -1 != m.start("nl") ) |
| 306 | + result = true; |
| 307 | + if ( -1 != m.start("nest") ) |
| 308 | + { |
| 309 | + m.region(m.end(0) + 1, m.regionEnd()); // + 1 to eat the * |
| 310 | + m.usePattern(BRACKETED_COMMENT_INSIDE); |
| 311 | + ++ level; |
| 312 | + state = 1; |
| 313 | + continue; |
| 314 | + } |
| 315 | + state = 2; // advance matcher region, then break loop |
| 316 | + break; |
| 317 | + case 1: |
| 318 | + if ( ! m.lookingAt() ) |
| 319 | + throw new InputMismatchException("unclosed comment"); |
| 320 | + if ( -1 != m.start("nest") ) |
| 321 | + { |
| 322 | + m.region(m.end(0) + 1, m.regionEnd()); // + 1 to eat the * |
| 323 | + ++ level; |
| 324 | + continue; |
| 325 | + } |
| 326 | + else if ( 0 == -- level ) |
| 327 | + state = 0; |
| 328 | + break; |
| 329 | + case 2: |
| 330 | + break loop; |
| 331 | + } |
| 332 | + m.region(m.end(0), m.regionEnd()); // advance past matched portion |
| 333 | + } |
| 334 | + return result; |
| 335 | + } |
| 336 | + |
199 | 337 | /** |
200 | 338 | * Return an Identifier.Simple, given a {@code Matcher} that has matched an |
201 | 339 | * ISO_AND_PG_IDENTIFIER_CAPTURING. Will determine from the matching named |
|
0 commit comments