From bd199b7d8f0273a78b893ef4d810a1f9d4cc1764 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 10 May 2021 15:04:57 +0200 Subject: [PATCH 1/8] Python: Add parser --- python/ql/src/semmle/python/Parser.qll | 445 ++++++++++++++++++ python/ql/src/semmle/python/RegexLiteral.qll | 27 ++ .../src/semmle/python/RegexParserExtended.qll | 424 +++++++++++++++++ 3 files changed, 896 insertions(+) create mode 100644 python/ql/src/semmle/python/Parser.qll create mode 100644 python/ql/src/semmle/python/RegexLiteral.qll create mode 100644 python/ql/src/semmle/python/RegexParserExtended.qll diff --git a/python/ql/src/semmle/python/Parser.qll b/python/ql/src/semmle/python/Parser.qll new file mode 100644 index 000000000000..8876a7d86a5f --- /dev/null +++ b/python/ql/src/semmle/python/Parser.qll @@ -0,0 +1,445 @@ +/** + * Parsing framework for QL + * + * Parser is performed as follows: + * 1. Search for all tokens using `ParsedString.tokens` + * 2. Perform a left-right tokenization, rejecting spurious tokens + * 3. Remove all tokens marked as whitespace or comments. + * 4. Put all tokens into sequence. + * 5. Perform a bottom-up parse of the text. + * + * The parsing algorithm is as follows: + * - all tokens are nodes. + * - for all pairs of adjacent nodes, merge them according to the rules of the grammar + * specifed by `ParserConfiguraiton.rule(...)` + * - iterate until there are no more nodes to generate. + * + * Steps to implement a parser: + * + * 1) Implement a parser configuration by extending the `ParserConfiguration` class. + * + * 1a) Specify the tokenizer. + * + * Specify the tokenizer by providing regexes to match keywords and tokens in the language. + * override `ParserConfiguration.hasTokenRegex`, `ParserConfiguration.hasWhitespaceRegex` + * and `ParserConfiguration.hasCommentRegex`. + * + * For tokens like keywords, the id of the token/node is equal to the matched string. + * + * For tokens like identifiers, use `ParserConfiguration.hasTokenRegex/2` to specify the id of + * the matched token. + * + * 1b) Specify the grammar rules. + * + * Override `ParserConfiguration.rule(...)` to specify the grammar rules. + * + * 2) Extend the class `ParsedString` with strings that contain the text you want to parse. + * Initially, these will simply be test cases. + * Then, it can be literals from a snapshot. + * Then, it will be whole files from the universal extractor matching the right file extension. + * + * 3) Create QL classes for interesting nodes. These will be of the form + * ``` + * abstract class ArithmeticExpr extends ExprNode { + * // All nodes are binary. `this.getLeftNode()` is an intermediate node. + * ExprNode getLeft() { result = this.getLeftNode().getLeftNode() } + * + * ExprNode getRight() { result = this.getRightNode() } + * } + * + * class SqlAddExpr extends ArithmeticExpr { + * SqlAddExpr() { id="expr+expr" } // The exact synthesized node id. + * } + * ``` + */ + +/** + * The configuration of a parser. + * + * Extend this class with each language you need to parse. + */ +abstract class ParserConfiguration extends string { + bindingset[this] + ParserConfiguration() { any() } + + predicate hasFileExtension(string ext) { none() } + + /** Tokens whose id is the same as the token text. */ + predicate hasTokenRegex(string regex) { none() } + + /** Whitespace tokens. */ + predicate hasWhitespaceRegex(string regex) { none() } + + /** Coment tokens. */ + predicate hasCommentRegex(string regex) { none() } + + /** Any other tokens not covered can map to a given token-id. 
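+   * For instance, a hypothetical configuration might add:
+   * ```
+   * override predicate hasTokenRegex(string regex, string id) {
+   *   regex = "[0-9]+" and id = "number"
+   * }
+   * ```
+   * so that runs of digits become `number` tokens (the `number` id is purely
+   * illustrative, not a token kind used by the parsers in this patch).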
*/ + predicate hasTokenRegex(string regex, string id) { none() } + + /** + * Grammar rules of the form + * result -> a + * + * The parser does not generate rules of id=`result`, but instead it + * searches for nodes of id=`a` when considering nodes to create. + */ + string rule(string a) { none() } + + /** + * Grammar rules of the form + * result -> a b + * + * If the parser sees a node id=`a` next to a node of id=`b`, + * then the parser creates a node of id=`a+b`. + * Nodes of id=`a+b` are considered to be nodes of id=`result` + * when considering nodes to create. + */ + string rule(string a, string b) { none() } + + /** + * Grammar rules of the form + * result -> a b c + */ + string rule(string a, string b, string c) { none() } + + private string convert(string a) { + result = rule(a) + or + exists(string a0, string b0, string c0 | result = rule(a0, b0, c0) and a = a0 + b0 + c0) + or + exists(string a0, string b0 | result = rule(a0, b0) and a = a0 + b0) + } + + bindingset[fromKind] + private string convertS(string fromKind) { + result = fromKind + or + result = convert(fromKind) + or + result = convert(convert(fromKind)) + or + result = convert(convert(convert(fromKind))) + or + result = convert(convert(convert(convert(fromKind)))) + } + + private string merge(string a, string b) { + exists(rule(a, b)) and result = a + b + or + exists(string a1, string b1, string c1 | exists(rule(a1, b1, c1)) | + a = a1 and b = b1 and result = a1 + b1 + or + a = a1 + b1 and b = c1 and result = a1 + b1 + c1 + ) + } + + predicate validSrc(string src) { + src = convert(_) or + exists(convert(src)) or + exists(merge(src, _)) or + exists(merge(_, src)) + } + + string convert2(string s) { + validSrc(s) and result = s + or + result = this.convert(s) + or + result = this.convert(convert2(s)) + } + + string merge2(string a, string b) { result = merge(convert2(a), convert2(b)) } + + predicate hasInterpolationRegex(string regex, string id) { none() } +} + +/** + * A string to be parsed. + */ +abstract class ParsedString extends string { + bindingset[this] + ParsedString() { any() } + + abstract ParserConfiguration getConfiguration(); + + /** + * Gets the tokens in the string. + * Override this predicate to implement your tokenizer. + * `start` is the offset of the token in this string. + * `id` is a meaningful identifier. + */ + cached + string tokens(int pos, string id) { + exists(ParserConfiguration config | config = this.getConfiguration() | + result = this.keywordToken(pos) and id = result.toUpperCase() + or + exists(string regex | config.hasWhitespaceRegex(regex) | + result = this.regexpFind(regex, _, pos) and id = "ws" + ) + or + exists(string regex | config.hasCommentRegex(regex) | + result = this.regexpFind(regex, _, pos) and id = "comment" + ) + or + exists(string regex | config.hasTokenRegex(regex, id) | + result = this.regexpFind(regex, _, pos) and + not result = this.keywordToken(pos) + ) + ) + } + + // Shouldn't need to cache this + cached + private string keywordToken(int pos) { + exists(string regex | this.getConfiguration().hasTokenRegex(regex) | + result = this.regexpFind(regex, _, pos) + ) + } + + /** + * Gets the syntax nodes in this parsed string. + * override this predicate with the grammar rules. + */ + abstract predicate getLocationInfo( + string file, int startLine, int startCol, int endLine, int endCol + ); + + // This is basically the parsing algorithm. + // - All tokens are nodes. + // - If you find two adjcacent nodes that can be merged, create a new node. 
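+  // For example, given a hypothetical rule `rule("expr", "+", "expr") = "expr"`,
+  // adjacent nodes with ids `expr` and `+` first merge into an intermediate node
+  // with id `expr+`, which then merges with a following `expr` node into a node
+  // with id `expr+expr`; that node is treated as an `expr` for further merges.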
+ predicate nodes(int start, int next, string id) { + exists(tokenize(this, id, _, start)) and next = start + 1 + or + exists(ParserConfiguration config, int mid, string id0, string id1 | + this.nodes(start, mid, id0) and + this.nodes(mid, next, id1) and + config = this.getConfiguration() and + id = config.merge2(id0, id1) + //id = config.merge(config.convertS(id0), config.convertS(id1)) + ) + } +} + +private predicate lines(ParsedString str, int index, int line) { + line = 0 and index = 0 + or + index = rank[line](int x | x = str.indexOf("\n") or x = str.length()) +} + +// Maps the position `pos` to a row and column +// within the string `text`. This is used for computing +// locations to nodes. +private predicate rowCol(ParsedString str, int index, int line, int col) { + exists(int index1, int index2 | + lines(str, index1, line - 1) and + lines(str, index2, line) and + index in [index1 .. index2 - 1] and + col = index - index1 + ) +} + +newtype TNode = + TNonterminalNode(ParsedString text, int startIndex, int endIndex, string id) { + text.nodes(startIndex, endIndex, id) + } + +/** + * Recursive predicate representing all nodes in the parse tree. + */ +predicate nodes(TNode node, ParsedString text, int start, int next, string id) { + node = TNonterminalNode(text, start, next, id) +} + +/** + * A syntax node. + */ +class Node extends TNode { + ParsedString text; + int start; + int next; + string id; + + /** Gets the token- or node- id of this node. */ + string getId() { result = id } + + /** Holds if this node is convertible to `toid`. */ + predicate hasId(string toid) { toid = text.getConfiguration().convert2(id) } + + /** Gets the offset of the text in the string. */ + int getStartOffset() { exists(tokenize(text, _, result, start)) } + + /** Gets the offset of the end of the text in the string. */ + int getEndOffset1() { exists(int pos | result = tokenize(text, _, pos, next - 1).length() + pos) } + + int getEndOffset2() { + // The offset of the end of the first token! + exists(int startPos | result = tokenize(text, _, startPos, start).length() + startPos) + } + + int getEndOffset() { + // The end offset isn't the end of the last token + // if this node is an interpolated string. + if this.getEndOffset1() < this.getEndOffset2() + then result = this.getEndOffset2() + else result = this.getEndOffset1() + } + + Node() { nodes(this, text, start, next, id) } + + predicate isBefore(Node other) { + exists(int otherstart | + nodes(other, text, otherstart, _, _) and + start < otherstart + ) + } + + string toString() { result = this.getText() } + + string getText() { result = text.substring(this.getStartOffset(), this.getEndOffset()) } + + /** + * Creates a location for the node using the location of the text, + * then adjust the starts and ends based on `start` and `end`. + */ + predicate hasLocationInfo(string file, int startLine, int startCol, int endLine, int endCol) { + exists(int line, int col | + text.getLocationInfo(file, line, col, _, _) and + nodeLocation(text, this.getStartOffset(), line, col, startLine, startCol) and + nodeLocation(text, this.getEndOffset() - 1, line, col, endLine, endCol) + ) + } + + predicate follows(Node previous) { nodes(previous, text, _, start, _) } + + /** + * Gets the left child of this node, if any. + * All nodes are terminal or binary. + */ + Node getLeftNode() { this.splits(result, _) } + + /** + * Gets the right child of this node, if any. + * All nodes are terminal or binary. 
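+   * Productions with more than two symbols therefore appear as a chain of
+   * binary nodes, with the intermediate node on the left.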
+ */ + Node getRightNode() { this.splits(_, result) } + + predicate splits(Node left, Node right) { + exists(ParserConfiguration config, int mid, string id0, string id1 | + nodes(left, text, start, mid, id0) and + nodes(right, text, mid, next, id1) and + id = config.merge2(id0, id1) + ) + } + + /** Gets a child node of this node, if any. */ + Node getAChildNode() { result = this.getLeftNode() or result = this.getRightNode() } + + /** Gets the parent of this node, if any. */ + Node getParent() { this = result.getAChildNode() } + + /** + * Holds if this is the root node that spans the entire input. + * It is not sufficient to not have a parent: that could be a parsed fragment. + */ + predicate isRoot() { start = 1 and next = getNumberOfTokens(text) + 1 } + + /** Holds if this node has a path to the root node. */ + predicate isRooted() { this.isRoot() or this.getParent().isRooted() } +} + +/** A node that is a token. */ +class Token extends Node { + Token() { next = start + 1 } +} + +pragma[noopt] +private predicate nodeLocation(ParsedString text, int pos, int line0, int col0, int line, int col) { + text.getLocationInfo(_, line0, col0, _, _) and + exists(int l, int c | rowCol(text, pos, l, c) | + line = line0 + l and + col = col0 + c + ) +} + +/** + * Performs a tokenization of the source text, ensuring that + * tokens are contiguous to remove spurious tokens (e.g. contents of strings). + * This tokenization includes whitespace and comment tokens + * that we will filter out later (in `nonWs`). + */ +pragma[noopt] +private string leftrightTokenize(ParsedString text, string id, int pos) { + result = longestToken(text, id, 0) and pos = 0 + or + exists(string prevText, int prevPos, int prevLength | + prevText = leftrightTokenize(text, _, prevPos) and + prevLength = prevText.length() and + pos = prevPos + prevLength and + result = longestToken(text, id, pos) + ) +} + +private string interpolatedToken(ParsedString text, string id, int pos) { + exists(string regex | text.getConfiguration().hasInterpolationRegex(regex, id) | + result = text.regexpFind(regex, _, pos) + ) +} + +// Special handling of interpolated strings +private string interpolatedStringTokens(ParsedString text, string id, int pos) { + exists(string interpolatedString, int start, int end | + interpolatedString = leftrightTokenize(text, "interpolatedstring", start) and + end = start + interpolatedString.length() + | + result = interpolatedToken(text, id, pos) and + pos > start and + pos < start + end - 1 + // pos in [start + 1, start + end - 1] + ) +} + +// Debugging predicate: Indicates which source strings are successfully tokenized. +predicate successfullyTokenized(ParsedString text) { tokenizedLength(text) = text.length() } + +// Debugging predicate: Indicates which source strings have not been +// successfully tokenized. Quick-eval this to see where the tokenizer has failed. +predicate unsuccessfullyTokenized(ParsedString text, int length, string failedAt) { + length = tokenizedLength(text) and + not successfullyTokenized(text) and + failedAt = text.suffix(length) +} + +// Gets the number of characters that were successfully tokenized in a source string. +// This is useful to debug the tokenizer. +int tokenizedLength(ParsedString text) { + exists(int maxpos | + maxpos = max(int pos | exists(leftrightTokenize(text, _, pos))) and + result = maxpos + leftrightTokenize(text, _, maxpos).length() + ) +} + +// Tidy up `tokens`. If the same position can have two different tokens, +// pick the longest token. 
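+// For example, if a configuration happened to match both `<` and `<=` at the same
+// offset (a hypothetical case), only the two-character token would be kept.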
+private string longestToken(ParsedString text, string id, int pos) { + result = text.tokens(pos, id) and + not text.tokens(pos, _).length() > result.length() +} + +private string nonWs(ParsedString text, string id, int pos) { + result = leftrightTokenize(text, id, pos) and + (id != "ws" and id != "comment") + or + result = interpolatedStringTokens(text, id, pos) +} + +/** + * Tokenizes the string left-right, removing whitespace and comments, + * and creates a rank `seq` for each token for all non-whitespace tokens. + */ +cached +string tokenize(ParsedString text, string id, int pos, int seq) { + pos = rank[seq](int p | exists(nonWs(text, _, p)) | p) and + result = nonWs(text, id, pos) +} + +int getNumberOfTokens(ParsedString text) { result = max(int n | exists(tokenize(text, _, _, n))) } diff --git a/python/ql/src/semmle/python/RegexLiteral.qll b/python/ql/src/semmle/python/RegexLiteral.qll new file mode 100644 index 000000000000..e6c324098321 --- /dev/null +++ b/python/ql/src/semmle/python/RegexLiteral.qll @@ -0,0 +1,27 @@ +import python +import semmle.python.regex as R +private import RegexParserExtended + +class RegexLiteralValue extends ParsedString { + R::Regex lit; + + RegexLiteralValue() { this = lit.getText() } + + override ParserConfiguration getConfiguration() { result instanceof RegexParserConfiguration } + + override predicate getLocationInfo( + string file, int startline, int startcol, int endline, int endcol + ) { + lit.getLocation().hasLocationInfo(file, startline + 1, startcol - 2, endline + 1, endcol - 2) + } + + R::Regex getLiteral() { result = lit } +} + +class RegexLiteral extends R::Regex { + RegexLiteralValue val; + + RegexLiteral() { val.getLiteral() = this } + + Regex getRegex() { result.getText() = val and result.isRoot() } +} diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll new file mode 100644 index 000000000000..d6a6a009afed --- /dev/null +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -0,0 +1,424 @@ +import Parser +private import RegexLiteral + +module RegexSpecific { + predicate allowedEmptyClasses() { none() } +} + +import RegexSpecific as Conf + +class RegexParserConfiguration extends ParserConfiguration { + RegexParserConfiguration() { this = "Extended regex parser configuration" } + + override predicate hasTokenRegex(string regex) { + regex = "[()|*+?\\-\\[\\]]" + or + regex = "\\[\\^" + } + + override predicate hasTokenRegex(string regex, string id) { + regex = "[^()|.$\\^\\[\\]\\\\]" and id = "normalchar" + or + regex = "\\\\[0-9]+" and id = "backref" + or + regex = "\\(\\?P=\\w+\\)" and id = "backref" + or + regex = "[.]" and id = "anychar" + or + regex = "[$]" and id = "dollar" + or + regex = "[\\^]" and id = "caret" + or + regex = "\\{[0-9]+\\}" and id = "fixedrepeat" + or + regex = "\\{,[0-9]+\\}" and id = "uptorepeat" + or + regex = "\\{[0-9]+,[0-9]+\\}" and id = "rangerepeat" + or + regex = "\\{[0-9]+,\\}" and id = "openrepeat" + or + regex = "\\\\[^AbBdDsSwWZafnNrtuUvx\\\\0-9]" and id = "normalchar" + or + regex = "\\\\[AbBdDsSwWZafnNrtuUvx\\\\]" and id = "escclass" + or + regex = "\\(\\?[aiLmsux]+\\)" and id = "confgroup" + or + regex = "\\(\\?:" and id = "(" + or + regex = "\\(\\?[aiLmsux]*-[imsx]+:" and id = "(" + or + regex = "\\(\\?#" and id = "(?#" + or + regex = "\\(\\?=" and id = "(?=" + or + regex = "\\(\\?!" and id = "(?!" + or + regex = "\\(\\?<=" and id = "(?<=" + or + regex = "\\(\\?" 
and id = "(named" + } + + predicate testRegex() { + // "(?P".regexpMatch("\\(\\?P<[:alnum:]+>") + "n1".regexpMatch("\\w+") + } + + /* + * Use a proper unambiguous grammar for regexes: + * + * regex -> orregex + * orregex -> seqregex + * | orregex '|' seqregex + * seqregex -> primary + * | primary seqregex + * primary -> group + * | primary * + * | primary + + * | char + * | class + * | escclass + * group -> '(' regex ')' + * | '(?#' regex ')' + * | '(?=' regex ')' + * | '(?!' regex ')' + * | '(?<=' regex ')' + * | '(? '[' classinner ']' + * | '[^' classinner ']' + * | '[]' if allowed empty classes + * | '[^]' if allowed empty classes + * classinner -> classstart classinner1 + * | classstart + * classinner1 -> classinner2 '-' + * | classinner2 + * classinner2 -> classpart + * | classpart classinner2 + * classstart -> '-' + * | ']' if not allowed empty classes + * | classpart + * classpart -> normalchar + * | classrange + * | escclass + * classrange -> normalchar '-' normalchar + * + * + * Things that currently don't parse: + * - Empty regexes (as standalone empty strings, or part of a disjunction or group, e.g. `(a|)` or `()`) + * - Inline options, i.e. `(?s)` + * - Lookaheads/lookbehinds + * - Java specific: Nested character classes, intersecting character classes + * + * Things that parse but with the wrong semantics: + * - Possesive and reluctant quantifiers (`a*?` is treated as an optional regex with body `a*`) + * - Most escape sequences with special meanings (i.e. besides "quote the next character" or predefined character classes) + */ + + override string rule(string a) { + a in ["char", "anychar", "dollar", "caret", "backref", "class", "escclass", "group"] and + result = "primary" + or + a = "primary" and result = "seqregex" + or + a = "seqregex" and result = "orregex" + or + a = "orregex" and result = "regex" + or + a = "confgroup" and result = "group" + or + a in ["normalchar", "-", "]"] and + result = "char" + or + a in ["normalchar", "anychar", "()|+*?".charAt(_)] and result = "clschar" + or + a = "classstart" and result = "classinner" + or + a = "classinner2" and result = "classinner1" + or + a in ["classpart", "-"] and result = "classstart" + or + a = "classpart" and result = "classinner2" + or + a = "]" and not Conf::allowedEmptyClasses() and result = "classstart" + or + a in ["clschar", "classrange", "escclass"] and result = "classpart" + } + + override string rule(string a, string b) { + a = "primary" and b = "seqregex" and result = "seqregex" + or + a = "primary" and b = "*" and result = "primary" + or + a = "primary" and b = "+" and result = "primary" + or + a = "primary" and b = "?" 
and result = "primary" + or + a = "primary" and b = "fixedrepeat" and result = "primary" + or + a = "primary" and b = "rangerepeat" and result = "primary" + or + a = "primary" and b = "uptorepeat" and result = "primary" + or + a = "primary" and b = "openrepeat" and result = "primary" + or + a in ["[", "[^"] and b = "]" and Conf::allowedEmptyClasses() and result = "class" + or + a = "classstart" and b = "classinner1" and result = "classinner" + or + a = "classpart" and b = "classinner2" and result = "classinner2" + or + a = "classinner2" and b = "-" and result = "classinner1" + } + + override string rule(string a, string b, string c) { + a = "orregex" and b = "|" and c = "seqregex" and result = "orregex" + or + a = "(" and b = "regex" and c = ")" and result = "group" + or + a = "(?#" and b = "regex" and c = ")" and result = "group" + or + a = "(?=" and b = "regex" and c = ")" and result = "group" + or + a = "(?!" and b = "regex" and c = ")" and result = "group" + or + a = "(?<=" and b = "regex" and c = ")" and result = "group" + or + a = "(?[^[]*)\\]\\((?P[^)]*)" and + result = tokenize(text, id, pos, seq) +} From 93c5896eebdbfd097dd2d216899eecd568e6aa9c Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 10 May 2021 15:06:01 +0200 Subject: [PATCH 2/8] Python: Add regex parser tests --- .../regexparser/Alternation.expected | 22 ++ .../library-tests/regexparser/Alternation.ql | 7 + .../regexparser/Characters.expected | 134 ++++++++++ .../library-tests/regexparser/Characters.ql | 11 + .../regexparser/FirstLast.expected | 103 ++++++++ .../library-tests/regexparser/FirstLast.ql | 12 + .../regexparser/GroupContents.expected | 18 ++ .../regexparser/GroupContents.ql | 7 + .../library-tests/regexparser/Mode.expected | 13 + .../ql/test/library-tests/regexparser/Mode.ql | 5 + .../regexparser/Qualified.expected | 15 ++ .../library-tests/regexparser/Qualified.ql | 6 + .../library-tests/regexparser/Regex.expected | 243 ++++++++++++++++++ .../test/library-tests/regexparser/Regex.ql | 52 ++++ .../ql/test/library-tests/regexparser/test.py | 72 ++++++ 15 files changed, 720 insertions(+) create mode 100644 python/ql/test/library-tests/regexparser/Alternation.expected create mode 100644 python/ql/test/library-tests/regexparser/Alternation.ql create mode 100644 python/ql/test/library-tests/regexparser/Characters.expected create mode 100644 python/ql/test/library-tests/regexparser/Characters.ql create mode 100644 python/ql/test/library-tests/regexparser/FirstLast.expected create mode 100644 python/ql/test/library-tests/regexparser/FirstLast.ql create mode 100644 python/ql/test/library-tests/regexparser/GroupContents.expected create mode 100644 python/ql/test/library-tests/regexparser/GroupContents.ql create mode 100644 python/ql/test/library-tests/regexparser/Mode.expected create mode 100644 python/ql/test/library-tests/regexparser/Mode.ql create mode 100644 python/ql/test/library-tests/regexparser/Qualified.expected create mode 100644 python/ql/test/library-tests/regexparser/Qualified.ql create mode 100644 python/ql/test/library-tests/regexparser/Regex.expected create mode 100644 python/ql/test/library-tests/regexparser/Regex.ql create mode 100644 python/ql/test/library-tests/regexparser/test.py diff --git a/python/ql/test/library-tests/regexparser/Alternation.expected b/python/ql/test/library-tests/regexparser/Alternation.expected new file mode 100644 index 000000000000..2fe6572074e6 --- /dev/null +++ b/python/ql/test/library-tests/regexparser/Alternation.expected @@ -0,0 +1,22 @@ +| 
(?:(?:\n\r?)\|^)( *)\\S | 3 | 12 | (?:\n\r?)\|^ | 3 | 10 | (?:\n\r?) | +| (?:(?:\n\r?)\|^)( *)\\S | 3 | 12 | (?:\n\r?)\|^ | 11 | 12 | ^ | +| (?:(?P^(?:\|x))) | 14 | 16 | \|x | 14 | 14 | | +| (?:(?P^(?:\|x))) | 14 | 16 | \|x | 15 | 16 | x | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 3 | 9 | [^%]\|^ | 3 | 7 | [^%] | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 3 | 9 | [^%]\|^ | 8 | 9 | ^ | +| (?P[\\w]+)\| | 0 | 16 | (?P[\\w]+)\| | 0 | 15 | (?P[\\w]+) | +| (?P[\\w]+)\| | 0 | 16 | (?P[\\w]+)\| | 16 | 16 | | +| (\\033\|~{) | 1 | 8 | \\033\|~{ | 1 | 5 | \\033 | +| (\\033\|~{) | 1 | 8 | \\033\|~{ | 6 | 8 | ~{ | +| \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 0 | 11 | \\\|\\[\\][123] | +| \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 12 | 16 | \\{\\} | +| \|x | 0 | 2 | \|x | 0 | 0 | | +| \|x | 0 | 2 | \|x | 1 | 2 | x | +| ^(^y\|^z)(u$\|v$)$ | 2 | 7 | ^y\|^z | 2 | 4 | ^y | +| ^(^y\|^z)(u$\|v$)$ | 2 | 7 | ^y\|^z | 5 | 7 | ^z | +| ^(^y\|^z)(u$\|v$)$ | 9 | 14 | u$\|v$ | 9 | 11 | u$ | +| ^(^y\|^z)(u$\|v$)$ | 9 | 14 | u$\|v$ | 12 | 14 | v$ | +| x\| | 0 | 2 | x\| | 0 | 1 | x | +| x\| | 0 | 2 | x\| | 2 | 2 | | +| x\|(?^(?:\|x))) | 10 | 11 | +| (?:(?P^(?:\|x))) | 15 | 16 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 5 | 6 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 8 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 11 | 12 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 12 | 14 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 15 | 17 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 19 | 21 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 22 | 23 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 24 | 25 | +| (?P[\\w]+)\| | 10 | 12 | +| (?m)^(?!$) | 4 | 5 | +| (?m)^(?!$) | 8 | 9 | +| (\\033\|~{) | 1 | 5 | +| (\\033\|~{) | 6 | 7 | +| (\\033\|~{) | 7 | 8 | +| [\ufffd-\ufffd] | 1 | 2 | +| [\ufffd-\ufffd] | 3 | 4 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 1 | 2 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 3 | 4 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 6 | 7 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 8 | 9 | +| []] | 1 | 2 | +| [^-] | 2 | 3 | +| [^A-Z] | 2 | 3 | +| [^A-Z] | 4 | 5 | +| [^]] | 2 | 3 | +| \\A[+-]?\\d+ | 0 | 2 | +| \\A[+-]?\\d+ | 3 | 4 | +| \\A[+-]?\\d+ | 4 | 5 | +| \\A[+-]?\\d+ | 7 | 9 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 0 | 2 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 12 | 13 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 16 | 18 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 18 | 20 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 30 | 31 | +| \\\|\\[\\][123]\|\\{\\} | 0 | 2 | +| \\\|\\[\\][123]\|\\{\\} | 2 | 4 | +| \\\|\\[\\][123]\|\\{\\} | 4 | 6 | +| \\\|\\[\\][123]\|\\{\\} | 7 | 8 | +| \\\|\\[\\][123]\|\\{\\} | 8 | 9 | +| \\\|\\[\\][123]\|\\{\\} | 9 | 10 | +| \\\|\\[\\][123]\|\\{\\} | 12 | 14 | +| \\\|\\[\\][123]\|\\{\\} | 14 | 16 | +| \|x | 1 | 2 | +| ^(^y\|^z)(u$\|v$)$ | 0 | 1 | +| ^(^y\|^z)(u$\|v$)$ | 2 | 3 | +| ^(^y\|^z)(u$\|v$)$ | 3 | 4 | +| ^(^y\|^z)(u$\|v$)$ | 5 | 6 | +| ^(^y\|^z)(u$\|v$)$ | 6 | 7 | +| ^(^y\|^z)(u$\|v$)$ | 9 | 10 | +| ^(^y\|^z)(u$\|v$)$ | 10 | 11 | +| ^(^y\|^z)(u$\|v$)$ | 12 | 13 | +| ^(^y\|^z)(u$\|v$)$ | 13 | 14 | +| ^(^y\|^z)(u$\|v$)$ | 15 | 16 | +| ^.$ | 0 | 1 | +| ^.$ | 1 | 2 | +| ^.$ | 2 | 3 | +| ^[A-Z_]+$(?^(?:\|x))) | first | 10 | 11 | +| (?:(?P^(?:\|x))) | first | 15 | 16 | +| (?:(?P^(?:\|x))) | last | 15 | 16 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 0 | 11 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 3 | 7 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 8 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 11 | 12 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | last | 21 | 26 | +| (?P[\\w]+)\| | first | 9 | 13 | +| (?P[\\w]+)\| | first | 9 | 14 | +| (?P[\\w]+)\| | last | 9 | 13 | +| (?P[\\w]+)\| 
| last | 9 | 14 | +| (?m)^(?!$) | first | 4 | 5 | +| (?m)^(?!$) | first | 8 | 9 | +| (?m)^(?!$) | last | 4 | 5 | +| (?m)^(?!$) | last | 8 | 9 | +| (\\033\|~{) | first | 1 | 5 | +| (\\033\|~{) | first | 6 | 7 | +| (\\033\|~{) | last | 1 | 5 | +| (\\033\|~{) | last | 7 | 8 | +| [\ufffd-\ufffd] | first | 0 | 5 | +| [\ufffd-\ufffd] | last | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | last | 5 | 10 | +| []] | first | 0 | 3 | +| []] | last | 0 | 3 | +| [^-] | first | 0 | 4 | +| [^-] | last | 0 | 4 | +| [^A-Z] | first | 0 | 6 | +| [^A-Z] | last | 0 | 6 | +| [^]] | first | 0 | 4 | +| [^]] | last | 0 | 4 | +| \\A[+-]?\\d+ | first | 0 | 2 | +| \\A[+-]?\\d+ | last | 7 | 9 | +| \\A[+-]?\\d+ | last | 7 | 10 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | first | 0 | 2 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | last | 28 | 32 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | last | 28 | 33 | +| \\\|\\[\\][123]\|\\{\\} | first | 0 | 2 | +| \\\|\\[\\][123]\|\\{\\} | first | 12 | 14 | +| \\\|\\[\\][123]\|\\{\\} | last | 6 | 11 | +| \\\|\\[\\][123]\|\\{\\} | last | 14 | 16 | +| \|x | first | 1 | 2 | +| \|x | last | 1 | 2 | +| ^(^y\|^z)(u$\|v$)$ | first | 0 | 1 | +| ^(^y\|^z)(u$\|v$)$ | first | 2 | 3 | +| ^(^y\|^z)(u$\|v$)$ | first | 3 | 4 | +| ^(^y\|^z)(u$\|v$)$ | first | 5 | 6 | +| ^(^y\|^z)(u$\|v$)$ | first | 6 | 7 | +| ^(^y\|^z)(u$\|v$)$ | last | 9 | 10 | +| ^(^y\|^z)(u$\|v$)$ | last | 10 | 11 | +| ^(^y\|^z)(u$\|v$)$ | last | 12 | 13 | +| ^(^y\|^z)(u$\|v$)$ | last | 13 | 14 | +| ^(^y\|^z)(u$\|v$)$ | last | 15 | 16 | +| ^.$ | first | 0 | 1 | +| ^.$ | first | 1 | 2 | +| ^.$ | last | 1 | 2 | +| ^.$ | last | 2 | 3 | +| ^[A-Z_]+$(?^(?:\|x))) | 0 | 19 | (?:(?P^(?:\|x))) | 3 | 18 | (?P^(?:\|x)) | +| (?:(?P^(?:\|x))) | 3 | 18 | (?P^(?:\|x)) | 10 | 17 | ^(?:\|x) | +| (?:(?P^(?:\|x))) | 11 | 17 | (?:\|x) | 14 | 16 | \|x | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 0 | 10 | (?:[^%]\|^) | 3 | 9 | [^%]\|^ | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 14 | 19 | (\\w*) | 15 | 18 | \\w* | +| (?P[\\w]+)\| | 0 | 15 | (?P[\\w]+) | 9 | 14 | [\\w]+ | +| (?m)^(?!$) | 5 | 10 | (?!$) | 8 | 9 | $ | +| (\\033\|~{) | 0 | 9 | (\\033\|~{) | 1 | 8 | \\033\|~{ | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 2 | 16 | (?P[^[]*) | 10 | 15 | [^[]* | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 20 | 34 | (?P[^)]*) | 28 | 33 | [^)]* | +| ^(^y\|^z)(u$\|v$)$ | 1 | 8 | (^y\|^z) | 2 | 7 | ^y\|^z | +| ^(^y\|^z)(u$\|v$)$ | 8 | 15 | (u$\|v$) | 9 | 14 | u$\|v$ | +| ^[A-Z_]+$(?[\\w]+)\| | 9 | 14 | false | +| \\A[+-]?\\d+ | 2 | 7 | true | +| \\A[+-]?\\d+ | 7 | 10 | false | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 10 | 15 | true | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 28 | 33 | true | +| ^[A-Z_]+$(?^(?:\|x))) | ^ | 10 | 11 | +| (?:(?P^(?:\|x))) | char | 15 | 16 | +| (?:(?P^(?:\|x))) | choice | 14 | 16 | +| (?:(?P^(?:\|x))) | non-empty group | 0 | 19 | +| (?:(?P^(?:\|x))) | non-empty group | 3 | 18 | +| (?:(?P^(?:\|x))) | non-empty group | 11 | 17 | +| (?:(?P^(?:\|x))) | sequence | 0 | 19 | +| (?:(?P^(?:\|x))) | sequence | 3 | 18 | +| (?:(?P^(?:\|x))) | sequence | 10 | 17 | +| (?:(?P^(?:\|x))) | sequence | 15 | 16 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | ^ | 8 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 5 | 6 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 11 | 12 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 12 | 14 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 15 | 17 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 19 | 21 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 22 | 23 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 24 | 25 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char-set | 3 | 7 | +| 
(?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char-set | 21 | 26 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | choice | 3 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | non-empty group | 0 | 10 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | non-empty group | 14 | 19 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | qualified | 0 | 11 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | qualified | 15 | 18 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 0 | 26 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 3 | 7 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 8 | 9 | +| (?P[\\w]+)\| | char | 10 | 12 | +| (?P[\\w]+)\| | char-set | 9 | 13 | +| (?P[\\w]+)\| | choice | 0 | 16 | +| (?P[\\w]+)\| | non-empty group | 0 | 15 | +| (?P[\\w]+)\| | qualified | 9 | 14 | +| (?P[\\w]+)\| | sequence | 0 | 15 | +| (?m)^(?!$) | $ | 8 | 9 | +| (?m)^(?!$) | ^ | 4 | 5 | +| (?m)^(?!$) | empty group | 0 | 4 | +| (?m)^(?!$) | empty group | 5 | 10 | +| (?m)^(?!$) | sequence | 0 | 10 | +| (?m)^(?!$) | sequence | 8 | 9 | +| (\\033\|~{) | char | 1 | 5 | +| (\\033\|~{) | char | 6 | 7 | +| (\\033\|~{) | char | 7 | 8 | +| (\\033\|~{) | choice | 1 | 8 | +| (\\033\|~{) | non-empty group | 0 | 9 | +| (\\033\|~{) | sequence | 0 | 9 | +| (\\033\|~{) | sequence | 1 | 5 | +| (\\033\|~{) | sequence | 6 | 8 | +| [\ufffd-\ufffd] | char | 1 | 2 | +| [\ufffd-\ufffd] | char | 3 | 4 | +| [\ufffd-\ufffd] | char-set | 0 | 5 | +| [\ufffd-\ufffd] | sequence | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 1 | 2 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 3 | 4 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 6 | 7 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 8 | 9 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char-set | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char-set | 5 | 10 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | sequence | 0 | 10 | +| []] | char | 1 | 2 | +| []] | char-set | 0 | 3 | +| []] | sequence | 0 | 3 | +| [^-] | char | 2 | 3 | +| [^-] | char-set | 0 | 4 | +| [^-] | sequence | 0 | 4 | +| [^A-Z] | char | 2 | 3 | +| [^A-Z] | char | 4 | 5 | +| [^A-Z] | char-set | 0 | 6 | +| [^A-Z] | sequence | 0 | 6 | +| [^]] | char | 2 | 3 | +| [^]] | char-set | 0 | 4 | +| [^]] | sequence | 0 | 4 | +| \\A[+-]?\\d+ | char | 0 | 2 | +| \\A[+-]?\\d+ | char | 3 | 4 | +| \\A[+-]?\\d+ | char | 4 | 5 | +| \\A[+-]?\\d+ | char | 7 | 9 | +| \\A[+-]?\\d+ | char-set | 2 | 6 | +| \\A[+-]?\\d+ | qualified | 2 | 7 | +| \\A[+-]?\\d+ | qualified | 7 | 10 | +| \\A[+-]?\\d+ | sequence | 0 | 10 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 0 | 2 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 12 | 13 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 16 | 18 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 18 | 20 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 30 | 31 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char-set | 10 | 14 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char-set | 28 | 32 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | non-empty group | 2 | 16 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | non-empty group | 20 | 34 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | qualified | 10 | 15 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | qualified | 28 | 33 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | sequence | 0 | 34 | +| \\\|\\[\\][123]\|\\{\\} | char | 0 | 2 | +| \\\|\\[\\][123]\|\\{\\} | char | 2 | 4 | +| \\\|\\[\\][123]\|\\{\\} | char | 4 | 6 | +| \\\|\\[\\][123]\|\\{\\} | char | 7 | 8 | +| \\\|\\[\\][123]\|\\{\\} | char | 8 | 9 | +| \\\|\\[\\][123]\|\\{\\} | char | 9 | 10 | +| \\\|\\[\\][123]\|\\{\\} | char | 12 | 14 | +| \\\|\\[\\][123]\|\\{\\} | char | 14 | 16 | +| \\\|\\[\\][123]\|\\{\\} | char-set | 6 | 11 | +| \\\|\\[\\][123]\|\\{\\} | choice | 0 | 16 | +| \\\|\\[\\][123]\|\\{\\} | sequence | 0 | 11 | +| 
\\\|\\[\\][123]\|\\{\\} | sequence | 12 | 16 | +| \|x | char | 1 | 2 | +| \|x | choice | 0 | 2 | +| \|x | sequence | 1 | 2 | +| ^(^y\|^z)(u$\|v$)$ | $ | 10 | 11 | +| ^(^y\|^z)(u$\|v$)$ | $ | 13 | 14 | +| ^(^y\|^z)(u$\|v$)$ | $ | 15 | 16 | +| ^(^y\|^z)(u$\|v$)$ | ^ | 0 | 1 | +| ^(^y\|^z)(u$\|v$)$ | ^ | 2 | 3 | +| ^(^y\|^z)(u$\|v$)$ | ^ | 5 | 6 | +| ^(^y\|^z)(u$\|v$)$ | char | 3 | 4 | +| ^(^y\|^z)(u$\|v$)$ | char | 6 | 7 | +| ^(^y\|^z)(u$\|v$)$ | char | 9 | 10 | +| ^(^y\|^z)(u$\|v$)$ | char | 12 | 13 | +| ^(^y\|^z)(u$\|v$)$ | choice | 2 | 7 | +| ^(^y\|^z)(u$\|v$)$ | choice | 9 | 14 | +| ^(^y\|^z)(u$\|v$)$ | non-empty group | 1 | 8 | +| ^(^y\|^z)(u$\|v$)$ | non-empty group | 8 | 15 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 0 | 16 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 2 | 4 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 5 | 7 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 9 | 11 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 12 | 14 | +| ^.$ | $ | 2 | 3 | +| ^.$ | . | 1 | 2 | +| ^.$ | ^ | 0 | 1 | +| ^.$ | sequence | 0 | 3 | +| ^[A-Z_]+$(? r.getEndOffset() + ) + ) and + kind = "sequence" + or + r instanceof ClassRegex and kind = "char-set" + or + zeroWidthMatch(r) and kind = "empty group" + or + r instanceof GroupRegex and not zeroWidthMatch(r) and kind = "non-empty group" + or + r instanceof SuffixRegex and kind = "qualified" + ) +} + +from Regex r, Regex part, int start, int end, string kind +where + part(part, start, end, kind) and // and r.hasLocationInfo("test.py", _, _, _, _) + r.isRoot() and + r = part.getParent*() +select r.getText(), kind, start, end diff --git a/python/ql/test/library-tests/regexparser/test.py b/python/ql/test/library-tests/regexparser/test.py new file mode 100644 index 000000000000..a113b85d3c98 --- /dev/null +++ b/python/ql/test/library-tests/regexparser/test.py @@ -0,0 +1,72 @@ +import re +# 0123456789ABCDEF +re.compile(r'012345678') +re.compile(r'(\033|~{)') +re.compile(r'\A[+-]?\d+') +re.compile(r'(?P[\w]+)|') +re.compile(r'\|\[\][123]|\{\}') +re.compile(r'^.$') +re.compile(r'[^A-Z]') +# 0123456789ABCDEF +re.sub('(?m)^(?!$)', indent*' ', s) +re.compile("(?:(?:\n\r?)|^)( *)\S") +re.compile("[]]") +re.compile("[^]]") +re.compile("[^-]") + +#Lookbehind group +re.compile(r'x|(?^(?:|x)))') + +#Misparsed on LGTM +re.compile(r"\[(?P[^[]*)\]\((?P[^)]*)") + +re.compile("", re.M) # ODASA-8056 + +# FP reported in https://github.com/github/codeql/issues/3712 +# This does not define a regex (but could be used by other code to do so) +escaped = re.escape("https://www.humblebundle.com/home/library") From e73cb0638ead618d6a0a70010c78d191d7246f13 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 10 May 2021 16:33:03 +0200 Subject: [PATCH 3/8] Python: Add tree view for ReDoS and AST viewer --- python/ql/src/semmle/python/RegexTreeView.qll | 309 ++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 python/ql/src/semmle/python/RegexTreeView.qll diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll new file mode 100644 index 000000000000..69f22fdeed30 --- /dev/null +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -0,0 +1,309 @@ +import python +private import semmle.python.RegexLiteral as L +private import semmle.python.RegexParserExtended as P + +/** Defenitions for compatibility with the JS ReDoS query */ +private newtype TRegExpParent = + TRegExpLiteral(L::RegexLiteral re) { exists(re.getRegex()) } or + TRegExp(P::Regex re) { + re.isRooted() and + not exists(P::OrRegex par | par.isRooted() and re.(P::OrRegex) = 
par.getLeft()) + } or + TClassChar(P::ClassChar ch) { ch.getClass().isRooted() and ch.isRooted() } or + TClassRange(P::ClassRange rn) { rn.isRooted() } + +class RegExpParent extends TRegExpParent { + RegExpTerm getChild(int i) { none() } + + RegExpTerm getAChild() { result = getChild(_) } + + RegExpParent getParent() { result.getAChild() = this } + + int getNumChild() { result = count(getAChild()) } + + string toString() { result = "" } + + predicate hasLocationInfo(string file, int startline, int startcol, int endline, int endcol) { + none() + } + + Location getLocation() { + result + .hasLocationInfo(this.getFile().getRelativePath(), this.getStartline(), this.getStartcol(), + this.getEndline(), this.getEndcol()) + } + + File getFile() { this.hasLocationInfo(result.getRelativePath(), _, _, _, _) } + + int getStartline() { this.hasLocationInfo(_, result, _, _, _) } + + int getStartcol() { this.hasLocationInfo(_, _, result, _, _) } + + int getEndline() { this.hasLocationInfo(_, _, _, result, _) } + + int getEndcol() { this.hasLocationInfo(_, _, _, _, result) } + + string getRawValue() { result = this.toString() } +} + +class RegExpLiteral extends RegExpParent, TRegExpLiteral { + L::RegexLiteral re; + + RegExpLiteral() { this = TRegExpLiteral(re) } + + override RegExpTerm getChild(int i) { result = TRegExp(re.getRegex()) and i = 0 } + + predicate isDotAll() { none() } + + override string toString() { result = re.toString() } + + override predicate hasLocationInfo( + string file, int startline, int startcol, int endline, int endcol + ) { + re.getLocation().hasLocationInfo(file, startline, startcol, endline, endcol) + } +} + +class RegExpTerm extends RegExpParent { + P::Node node; + + RegExpTerm() { + this = TRegExp(node) or + this = TClassChar(node) or + this = TClassRange(node) + } + + predicate isUsedAsRegExp() { any() } + + predicate isRootTerm() { node.isRoot() } + + override string toString() { result = node.toString() } + + RegExpLiteral getLiteral() { result = getRootTerm().getParent() } + + /** + * Gets the outermost term of this regular expression. 
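+   * This is the term whose parent is the enclosing `RegExpLiteral`.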
+ */ + RegExpTerm getRootTerm() { + isRootTerm() and + result = this + or + result = getParent().(RegExpTerm).getRootTerm() + } + + override predicate hasLocationInfo( + string file, int startline, int startcol, int endline, int endcol + ) { + node.hasLocationInfo(file, startline, startcol, endline, endcol) + } +} + +private class NormalRegExpTerm extends RegExpTerm, TRegExp { + override P::Regex node; + + NormalRegExpTerm() { this = TRegExp(node) } +} + +class RegExpAlt extends NormalRegExpTerm { + override P::OrRegex node; + + override RegExpTerm getChild(int i) { + result = TRegExp(orRevChild(node, orNumChild(node) - i - 1)) + } +} + +private P::Regex orRevChild(P::Regex re, int i) { + i = 0 and + not re instanceof P::OrRegex and + result = re + or + i = 0 and + result = re.(P::OrRegex).getRight() + or + i > 0 and + result = orRevChild(re.(P::OrRegex).getLeft(), i - 1) +} + +private int orNumChild(P::OrRegex re) { result = strictcount(orRevChild(re, _)) } + +class RegExpQuantifier extends NormalRegExpTerm { + override P::SuffixRegex node; + + override RegExpTerm getChild(int i) { i = 0 and result = TRegExp(node.getBody()) } +} + +class RegExpLookbehind extends NormalRegExpTerm { + RegExpLookbehind() { + node instanceof P::NegativeLookbehindRegex or node instanceof P::PositiveLookbehindRegex + } +} + +class RegExpStar extends RegExpQuantifier { + override P::StarRegex node; +} + +class RegExpPlus extends RegExpQuantifier { + override P::PlusRegex node; +} + +class RegExpRange extends RegExpQuantifier { + override P::RepeatRegex node; + + int getLowerBound() { result = node.getLowerBound() } + + int getUpperBound() { result = node.getUpperBound() } +} + +class RegExpOpt extends RegExpQuantifier { + override P::OptionalRegex node; +} + +class RegExpConstant extends RegExpTerm { + RegExpConstant() { + this = TRegExp(node.(P::ChRegex)) + or + this = TClassChar(node) + } + + predicate isCharacter() { any() } + + string getValue() { + result = node.(P::ChRegex).getChar() + or + result = node.(P::ClassChar).getChar() + } +} + +class RegExpDot extends NormalRegExpTerm { + override P::DotRegex node; +} + +class RegExpDollar extends NormalRegExpTerm { + override P::DollarRegex node; +} + +class RegExpCaret extends NormalRegExpTerm { + override P::CaretRegex node; +} + +// predicate findIt() +class RegExpCharacterClass extends NormalRegExpTerm { + override P::ClassRegex node; + + override RegExpTerm getChild(int i) { + result = classPart(classChildHelper0(node.getLeftNode().getRightNode(), i)) + } + + predicate isInverted() { node.isInverted() } + + predicate isUniversalClass() { + // [^] + isInverted() and not exists(getAChild()) + or + // [\w\W] and similar + not isInverted() and + exists(string cce1, string cce2 | + cce1 = getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } +} + +private RegExpTerm classPart(P::Node node) { + result = TClassChar(node) or + result = TClassRange(node) or + result = TRegExp(node.(P::EscapeClassRegex)) +} + +private P::Node classChildHelper0(P::Node node, int i) { + node.hasId("classstart") and i = 0 and result = node + or + node.getId() = "classstartclassinner1" and + ( + i = 0 and result = node.getLeftNode() + or + i > 0 and result = classChildHelper1(node.getRightNode(), i - 1) + ) +} + +private P::Node classChildHelper1(P::Node node, int i) { + node.hasId("classinner2") and result = classChildHelper2(node, i) + or + node.getId() 
= "classinner2-" and + exists(P::Node left, int num | + left = node.getLeftNode() and + num = classInner2NumChild(left) and + ( + i = num and + result = node.getRightNode() + or + i < num and + i >= 0 and + result = classChildHelper2(left, i) + ) + ) +} + +private int classInner2NumChild(P::Node node) { result = strictcount(classChildHelper2(node, _)) } + +private P::Node classChildHelper2(P::Node node, int i) { + node.hasId("classpart") and i = 0 and result = node + or + node.getId() = "classpartclassinner2" and + ( + i = 0 and result = node.getLeftNode() + or + i > 0 and result = classChildHelper2(node.getRightNode(), i - 1) + ) +} + +class RegExpCharacterClassEscape extends NormalRegExpTerm { + override P::EscapeClassRegex node; + + string getValue() { result = node.getClass() } +} + +class RegExpCharacterRange extends RegExpTerm { + override P::ClassRange node; + + RegExpCharacterRange() { this = TClassRange(node) } + + override RegExpTerm getChild(int i) { + i = 0 and + result = TClassChar(node.getLowerBound()) + or + i = 1 and + result = TClassChar(node.getUpperBound()) + } + + /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */ + predicate isRange(string lo, string hi) { + lo = getChild(0).(RegExpConstant).getValue() and + hi = getChild(1).(RegExpConstant).getValue() + } +} + +class RegExpSequence extends NormalRegExpTerm { + override P::SequenceRegex node; + + override RegExpTerm getChild(int i) { + i = 0 and + result = TRegExp(node.getLeft()) + or + i = 1 and + result = TRegExp(node.getRight()) + } +} + +class RegExpGroup extends NormalRegExpTerm { + override P::CaptureRegex node; + + override RegExpTerm getChild(int i) { + i = 0 and + result = TRegExp(node.getBody()) + } +} + +RegExpTerm getParsedRegExp(StrConst re) { result = TRegExpLiteral(re).(RegExpLiteral).getChild(0) } From 8d63d340694f7cfb3ec6025266d901b8ef6a88d4 Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Tue, 11 May 2021 00:07:09 +0200 Subject: [PATCH 4/8] Python: add printAst support for regular expressions --- python/ql/src/semmle/python/PrintAst.qll | 39 +++++++++++++++++++ python/ql/src/semmle/python/RegexTreeView.qll | 34 ++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/python/ql/src/semmle/python/PrintAst.qll b/python/ql/src/semmle/python/PrintAst.qll index 63ec5b53d0a2..59e2458af419 100644 --- a/python/ql/src/semmle/python/PrintAst.qll +++ b/python/ql/src/semmle/python/PrintAst.qll @@ -7,6 +7,7 @@ */ import python +import semmle.python.RegexTreeView private newtype TPrintAstConfiguration = MkPrintAstConfiguration() @@ -53,6 +54,9 @@ private newtype TPrintAstNode = not list = any(Module mod).getBody() and not forall(AstNode child | child = list.getAnItem() | isNotNeeded(child)) and exists(list.getAnItem()) + } or + TRegExpTermNode(RegExpTerm term) { + exists(StrConst str | term.getRootTerm() = getParsedRegExp(str) and shouldPrint(str, _)) } /** @@ -419,6 +423,41 @@ class ParameterNode extends AstElementNode { } } +/** + * A print node for a `StrConst`. + * + * The string has a child, if the child is used as a regular expression, + * which is the root of the regular expression. + */ +class StrConstNode extends AstElementNode { + override StrConst element; + + override PrintAstNode getChild(int childIndex) { + childIndex = 0 and result.(RegExpTermNode).getTerm() = getParsedRegExp(element) + } +} + +/** + * A print node for a regular expression term. 
+ */ +class RegExpTermNode extends TRegExpTermNode, PrintAstNode { + RegExpTerm term; + + RegExpTermNode() { this = TRegExpTermNode(term) } + + RegExpTerm getTerm() { result = term } + + override PrintAstNode getChild(int childIndex) { + result.(RegExpTermNode).getTerm() = term.getChild(childIndex) + } + + override string toString() { + result = "[" + strictconcat(term.getPrimaryQLClass(), " | ") + "] " + term.toString() + } + + override Location getLocation() { result = term.getLocation() } +} + /** * Gets the `i`th child from `node` ordered by location. */ diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index 69f22fdeed30..9014e06707d3 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -96,6 +96,8 @@ class RegExpTerm extends RegExpParent { ) { node.hasLocationInfo(file, startline, startcol, endline, endcol) } + + string getPrimaryQLClass() { result = "???" } } private class NormalRegExpTerm extends RegExpTerm, TRegExp { @@ -110,6 +112,8 @@ class RegExpAlt extends NormalRegExpTerm { override RegExpTerm getChild(int i) { result = TRegExp(orRevChild(node, orNumChild(node) - i - 1)) } + + override string getPrimaryQLClass() { result = "RegExpAlt" } } private P::Regex orRevChild(P::Regex re, int i) { @@ -130,20 +134,28 @@ class RegExpQuantifier extends NormalRegExpTerm { override P::SuffixRegex node; override RegExpTerm getChild(int i) { i = 0 and result = TRegExp(node.getBody()) } + + override string getPrimaryQLClass() { result = "RegExpQuantifier" } } class RegExpLookbehind extends NormalRegExpTerm { RegExpLookbehind() { node instanceof P::NegativeLookbehindRegex or node instanceof P::PositiveLookbehindRegex } + + override string getPrimaryQLClass() { result = "RegExpLookbehind" } } class RegExpStar extends RegExpQuantifier { override P::StarRegex node; + + override string getPrimaryQLClass() { result = "RegExpStar" } } class RegExpPlus extends RegExpQuantifier { override P::PlusRegex node; + + override string getPrimaryQLClass() { result = "RegExpPlus" } } class RegExpRange extends RegExpQuantifier { @@ -152,10 +164,14 @@ class RegExpRange extends RegExpQuantifier { int getLowerBound() { result = node.getLowerBound() } int getUpperBound() { result = node.getUpperBound() } + + override string getPrimaryQLClass() { result = "RegExpRange" } } class RegExpOpt extends RegExpQuantifier { override P::OptionalRegex node; + + override string getPrimaryQLClass() { result = "RegExpOpt" } } class RegExpConstant extends RegExpTerm { @@ -172,18 +188,26 @@ class RegExpConstant extends RegExpTerm { or result = node.(P::ClassChar).getChar() } + + override string getPrimaryQLClass() { result = "RegExpConstant" } } class RegExpDot extends NormalRegExpTerm { override P::DotRegex node; + + override string getPrimaryQLClass() { result = "RegExpDot" } } class RegExpDollar extends NormalRegExpTerm { override P::DollarRegex node; + + override string getPrimaryQLClass() { result = "RegExpDollar" } } class RegExpCaret extends NormalRegExpTerm { override P::CaretRegex node; + + override string getPrimaryQLClass() { result = "RegExpCaret" } } // predicate findIt() @@ -209,6 +233,8 @@ class RegExpCharacterClass extends NormalRegExpTerm { cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() ) } + + override string getPrimaryQLClass() { result = "RegExpCharacterClass" } } private RegExpTerm classPart(P::Node node) { @@ -263,6 +289,8 @@ class RegExpCharacterClassEscape extends NormalRegExpTerm { 
override P::EscapeClassRegex node; string getValue() { result = node.getClass() } + + override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } } class RegExpCharacterRange extends RegExpTerm { @@ -283,6 +311,8 @@ class RegExpCharacterRange extends RegExpTerm { lo = getChild(0).(RegExpConstant).getValue() and hi = getChild(1).(RegExpConstant).getValue() } + + override string getPrimaryQLClass() { result = "RegExpCharacterRange" } } class RegExpSequence extends NormalRegExpTerm { @@ -295,6 +325,8 @@ class RegExpSequence extends NormalRegExpTerm { i = 1 and result = TRegExp(node.getRight()) } + + override string getPrimaryQLClass() { result = "RegExpSequence" } } class RegExpGroup extends NormalRegExpTerm { @@ -304,6 +336,8 @@ class RegExpGroup extends NormalRegExpTerm { i = 0 and result = TRegExp(node.getBody()) } + + override string getPrimaryQLClass() { result = "RegExpGroup" } } RegExpTerm getParsedRegExp(StrConst re) { result = TRegExpLiteral(re).(RegExpLiteral).getChild(0) } From 690b0552eca974f891f136bf92bf42a4354f64f7 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Tue, 11 May 2021 13:39:28 +0200 Subject: [PATCH 5/8] Python: Limit strings to parse --- python/ql/src/semmle/python/RegexLiteral.qll | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ql/src/semmle/python/RegexLiteral.qll b/python/ql/src/semmle/python/RegexLiteral.qll index e6c324098321..02b4df4cac5e 100644 --- a/python/ql/src/semmle/python/RegexLiteral.qll +++ b/python/ql/src/semmle/python/RegexLiteral.qll @@ -5,7 +5,10 @@ private import RegexParserExtended class RegexLiteralValue extends ParsedString { R::Regex lit; - RegexLiteralValue() { this = lit.getText() } + RegexLiteralValue() { + this = lit.getText() and + exists(lit.getLocation().getFile().getRelativePath()) + } override ParserConfiguration getConfiguration() { result instanceof RegexParserConfiguration } From 99cbb11c97e92ad1dc955c73866e38fa0925264e Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Thu, 20 May 2021 15:39:50 +0200 Subject: [PATCH 6/8] Python: A number of parser tweaks --- .../src/semmle/python/RegexParserExtended.qll | 185 ++++++++++++++---- python/ql/src/semmle/python/RegexTreeView.qll | 3 +- .../regexparser/Alternation.expected | 2 +- .../library-tests/regexparser/Alternation.ql | 16 +- .../regexparser/GroupContents.ql | 16 +- .../library-tests/regexparser/Qualified.ql | 15 +- .../test/library-tests/regexparser/Regex.ql | 9 - 7 files changed, 179 insertions(+), 67 deletions(-) diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll index d6a6a009afed..3f538b054538 100644 --- a/python/ql/src/semmle/python/RegexParserExtended.qll +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -60,22 +60,20 @@ class RegexParserConfiguration extends ParserConfiguration { regex = "\\(\\?P<\\w+>" and id = "(named" } - predicate testRegex() { - // "(?P".regexpMatch("\\(\\?P<[:alnum:]+>") - "n1".regexpMatch("\\w+") - } - /* * Use a proper unambiguous grammar for regexes: * * regex -> orregex * orregex -> seqregex * | orregex '|' seqregex + * | '|' seqregex + * | orregex '|' * seqregex -> primary * | primary seqregex * primary -> group - * | primary * - * | primary + + * | primary '*' + * | primary '+' + * | primary '?' 
* | char * | class * | escclass @@ -93,27 +91,32 @@ class RegexParserConfiguration extends ParserConfiguration { * | '[^]' if allowed empty classes * classinner -> classstart classinner1 * | classstart - * classinner1 -> classinner2 '-' - * | classinner2 - * classinner2 -> classpart - * | classpart classinner2 - * classstart -> '-' - * | ']' if not allowed empty classes - * | classpart - * classpart -> normalchar - * | classrange + * | classinner1 + * classstart -> ']' + * classinner1 -> classpart + * | classpart classinner1 + * | classpart_c + * | classpart_c- + * | '-' + * classpart_c -> clschar + * classpart_c clschar + * classpart clschar + * classpart_c- -> classpart_c '-' + * classpart -> // does not end in a clschar * | escclass - * classrange -> normalchar '-' normalchar - * + * | classpart_c escclass + * | classpart_c- escclass + * | classpart_c- '-' + * | classrange + * classrange -> clschar '-' clschar + * clschar -> normalchar + * | anychar + * | '(', ')', '|', '+', '*', '?' * * Things that currently don't parse: - * - Empty regexes (as standalone empty strings, or part of a disjunction or group, e.g. `(a|)` or `()`) - * - Inline options, i.e. `(?s)` - * - Lookaheads/lookbehinds - * - Java specific: Nested character classes, intersecting character classes + * - Empty regexes (as standalone empty strings, or part of a group, e.g. `()`) * * Things that parse but with the wrong semantics: - * - Possesive and reluctant quantifiers (`a*?` is treated as an optional regex with body `a*`) * - Most escape sequences with special meanings (i.e. besides "quote the next character" or predefined character classes) */ @@ -123,6 +126,8 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "primary" and result = "seqregex" or + // a = "orregex|" and result = "orregex" + // or a = "seqregex" and result = "orregex" or a = "orregex" and result = "regex" @@ -132,22 +137,24 @@ class RegexParserConfiguration extends ParserConfiguration { a in ["normalchar", "-", "]"] and result = "char" or - a in ["normalchar", "anychar", "()|+*?".charAt(_)] and result = "clschar" - or - a = "classstart" and result = "classinner" - or - a = "classinner2" and result = "classinner1" + a in ["normalchar", "anychar", "()|+*?[".charAt(_)] and result = "clschar" or - a in ["classpart", "-"] and result = "classstart" + a in ["classstart", "classinner1"] and result = "classinner" or - a = "classpart" and result = "classinner2" + a in ["classpart", "classpart_c", "classpart_c-", "-"] and result = "classinner1" or a = "]" and not Conf::allowedEmptyClasses() and result = "classstart" or - a in ["clschar", "classrange", "escclass"] and result = "classpart" + a in ["classrange", "escclass"] and result = "classpart" + or + a = "clschar" and result = "classpart_c" } override string rule(string a, string b) { + a = "|" and b = "seqregex" and result = "orregex" + or + a = "orregex" and b = "|" and result = "orregex" + or a = "primary" and b = "seqregex" and result = "seqregex" or a = "primary" and b = "*" and result = "primary" @@ -168,9 +175,17 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "classstart" and b = "classinner1" and result = "classinner" or - a = "classpart" and b = "classinner2" and result = "classinner2" + a = "classpart" and b = "classinner1" and result = "classinner1" + or + a in ["classpart", "classpart_c"] and b = "clschar" and result = "classpart_c" + or + a = "classpart_c" and b = "-" and result = "classpart_c-" + or + a = "classpart_c" and b = "escclass" and result = 
"classpart" or - a = "classinner2" and b = "-" and result = "classinner1" + a = "classpart_c-" and b = "escclass" and result = "classpart" + or + a = "classpart_c-" and b = "-" and result = "classpart" } override string rule(string a, string b, string c) { @@ -276,7 +291,9 @@ class ClassRange extends Node { } class SequenceRegex extends Regex { - SequenceRegex() { id = "primaryseqregex" } + SequenceRegex() { + this.hasId("primaryseqregex") and not this.getParent().getId() = "primaryseqregex" + } Regex getLeft() { result = this.getLeftNode() } @@ -284,7 +301,15 @@ class SequenceRegex extends Regex { } abstract class SuffixRegex extends Regex { - Regex getBody() { result = this.getLeftNode() } + Regex getBody() { + if this.isNonGreedy() + then result = this.getLeftNode().getLeftNode() + else result = this.getLeftNode() + } + + abstract predicate isMaybeEmpty(); + + abstract predicate isNonGreedy(); } abstract class UnboundedRegex extends SuffixRegex { } @@ -293,14 +318,38 @@ abstract class RepeatRegex extends SuffixRegex { abstract int getLowerBound(); abstract int getUpperBound(); + + override predicate isMaybeEmpty() { getLowerBound() = 0 } + + override predicate isNonGreedy() { none() } } class StarRegex extends UnboundedRegex { - StarRegex() { id = "primary*" } + boolean nonGreedy; + + StarRegex() { + id = "primary*" and not this.getParent().getId() = "primary?" and nonGreedy = false + or + id = "primary?" and this.getLeftNode().getId() = "primary*" and nonGreedy = true + } + + override predicate isMaybeEmpty() { any() } + + override predicate isNonGreedy() { nonGreedy = true } } class PlusRegex extends UnboundedRegex { - PlusRegex() { id = "primary+" } + boolean nonGreedy; + + PlusRegex() { + id = "primary+" and not this.getParent().getId() = "primary?" and nonGreedy = false + or + id = "primary?" and this.getLeftNode().getId() = "primary+" and nonGreedy = true + } + + override predicate isMaybeEmpty() { none() } + + override predicate isNonGreedy() { nonGreedy = true } } class FixedRepeatRegex extends SuffixRegex, RepeatRegex { @@ -366,15 +415,48 @@ class OpenRepeatRegex extends UnboundedRegex, RepeatRegex { } class OptionalRegex extends SuffixRegex { - OptionalRegex() { id = "primary?" } + boolean nonGreedy; + + OptionalRegex() { + id = "primary?" and + not this.getLeftNode().getId() = "primary*" and + not this.getLeftNode().getId() = "primary+" and + if this.getLeftNode().getId() = "primary?" 
then nonGreedy = true else nonGreedy = false + } + + override predicate isMaybeEmpty() { any() } + + override predicate isNonGreedy() { nonGreedy = true } } -class OrRegex extends Regex { - OrRegex() { id = "orregex|seqregex" } +abstract class OrRegex extends Regex { + abstract Regex getLeft(); - Regex getLeft() { result = this.getLeftNode().getLeftNode() } + abstract Regex getRight(); +} - Regex getRight() { result = this.getRightNode() } +class FullOrRegex extends OrRegex { + FullOrRegex() { id = "orregex|seqregex" } + + override Regex getLeft() { result = this.getLeftNode().getLeftNode() } + + override Regex getRight() { result = this.getRightNode() } +} + +class LeftOrRegex extends OrRegex { + LeftOrRegex() { id = "orregex|" and not this.getParent() instanceof FullOrRegex } + + override Regex getLeft() { result = this.getLeftNode() } + + override Regex getRight() { none() } +} + +class RightOrRegex extends OrRegex { + RightOrRegex() { id = "|seqregex" } + + override Regex getLeft() { none() } + + override Regex getRight() { result = this.getRightNode() } } class CaptureRegex extends Regex { @@ -389,9 +471,11 @@ class BackrefRegex extends Regex { class GroupRegex extends Regex { GroupRegex() { this.hasId("group") } + + Regex getContents() { result = this.getLeftNode().getRightNode() } } -class ConfGroupRegex extends Regex { +class ConfGroupRegex extends GroupRegex { ConfGroupRegex() { this.hasId("confgroup") } } @@ -418,7 +502,22 @@ class ParsedRegex extends Regex { } string testTokenize(ParsedString text, string id, int pos, int seq) { + // text.toString() = "\\|\\[\\][123]|\\{\\}" and // text.toString() = "\\A[+-]?\\d+" and text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and + // text.toString() = "(?m)^(?!$)" and result = tokenize(text, id, pos, seq) } + +predicate testRegex() { + "(?P".regexpMatch("\\(\\?P<\\w+>") + // "n1".regexpMatch("\\w+") +} + +predicate testParse(ParsedString s, int start, int next, string id) { + // s = "\\A[+-]?\\d+" and + // s = "\\|\\[\\][123]|\\{\\}" and + // s = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and + s = "012345678" and + s.nodes(start, next, id) +} diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index 9014e06707d3..f41a2c7e7a17 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -2,7 +2,7 @@ import python private import semmle.python.RegexLiteral as L private import semmle.python.RegexParserExtended as P -/** Defenitions for compatibility with the JS ReDoS query */ +/** Definitions for compatibility with the JS ReDoS query */ private newtype TRegExpParent = TRegExpLiteral(L::RegexLiteral re) { exists(re.getRegex()) } or TRegExp(P::Regex re) { @@ -174,6 +174,7 @@ class RegExpOpt extends RegExpQuantifier { override string getPrimaryQLClass() { result = "RegExpOpt" } } +// TODO: This is supposed to be a constant sequence. class RegExpConstant extends RegExpTerm { RegExpConstant() { this = TRegExp(node.(P::ChRegex)) diff --git a/python/ql/test/library-tests/regexparser/Alternation.expected b/python/ql/test/library-tests/regexparser/Alternation.expected index 2fe6572074e6..e50655fdc24b 100644 --- a/python/ql/test/library-tests/regexparser/Alternation.expected +++ b/python/ql/test/library-tests/regexparser/Alternation.expected @@ -19,4 +19,4 @@ | x\| | 0 | 2 | x\| | 0 | 1 | x | | x\| | 0 | 2 | x\| | 2 | 2 | | | x\|(? 
r.getEndOffset() - ) - ) and kind = "sequence" or r instanceof ClassRegex and kind = "char-set" From fc5f2e6138d88edacc805731fc4e377cc285ab79 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Thu, 20 May 2021 18:01:42 +0200 Subject: [PATCH 7/8] Python: use constants --- .../src/semmle/python/RegexParserExtended.qll | 49 ++++++++++++++++--- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll index 3f538b054538..62ade6893b1b 100644 --- a/python/ql/src/semmle/python/RegexParserExtended.qll +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -7,17 +7,23 @@ module RegexSpecific { import RegexSpecific as Conf +private string escapableChars() { result = "AbBdDsSwWZafnNrtuUvx\\\\" } + +private string keywordChars() { result = "()|*+?\\-\\[\\]" } + class RegexParserConfiguration extends ParserConfiguration { RegexParserConfiguration() { this = "Extended regex parser configuration" } override predicate hasTokenRegex(string regex) { - regex = "[()|*+?\\-\\[\\]]" + regex = "[" + keywordChars() + "]" or regex = "\\[\\^" } override predicate hasTokenRegex(string regex, string id) { - regex = "[^()|.$\\^\\[\\]\\\\]" and id = "normalchar" + regex = "[^" + keywordChars() + ".$\\^\\\\]" and id = "normalchar" + or + regex = "\\\\[^" + escapableChars() + "0-9]" and id = "normalchar" or regex = "\\\\[0-9]+" and id = "backref" or @@ -37,9 +43,7 @@ class RegexParserConfiguration extends ParserConfiguration { or regex = "\\{[0-9]+,\\}" and id = "openrepeat" or - regex = "\\\\[^AbBdDsSwWZafnNrtuUvx\\\\0-9]" and id = "normalchar" - or - regex = "\\\\[AbBdDsSwWZafnNrtuUvx\\\\]" and id = "escclass" + regex = "\\\\[" + escapableChars() + "]" and id = "escclass" or regex = "\\(\\?[aiLmsux]+\\)" and id = "confgroup" or @@ -504,14 +508,38 @@ class ParsedRegex extends Regex { string testTokenize(ParsedString text, string id, int pos, int seq) { // text.toString() = "\\|\\[\\][123]|\\{\\}" and // text.toString() = "\\A[+-]?\\d+" and - text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and + // text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and // text.toString() = "(?m)^(?!$)" and + text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and result = tokenize(text, id, pos, seq) } -predicate testRegex() { - "(?P".regexpMatch("\\(\\?P<\\w+>") +string canParse(ParsedString text) { result = text.toString() } + +predicate testTokenRegex(string text, string kind) { + // "(?P".regexpMatch("\\(\\?P<\\w+>") + // "n1".regexpMatch("\\w+") + exists(string regex | + any(RegexParserConfiguration c).hasTokenRegex(regex, kind) and + text.regexpMatch(regex) + ) and + text = "_" and + kind = "normalchar" +} + +predicate testT(ParsedString text, int length, string failedAt) { + unsuccessfullyTokenized(text, length, failedAt) //and + // text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" +} + +predicate testKeywordRegex(string text) { + // "(?P".regexpMatch("\\(\\?P<\\w+>") // "n1".regexpMatch("\\w+") + exists(string regex | + any(RegexParserConfiguration c).hasTokenRegex(regex) and + text.regexpMatch(regex) + ) and + text = "(" } predicate testParse(ParsedString s, int start, int next, string id) { @@ -521,3 +549,8 @@ predicate testParse(ParsedString s, int start, int next, string id) { s = "012345678" and s.nodes(start, next, id) } + +string testRawTokens(ParsedString s, int pos, string id) { + s.toString() = 
"^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and + result = s.tokens(pos, id) +} From d0f2857f881ecb04912de9247e245a5d3b41a348 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 26 May 2021 09:53:34 +0200 Subject: [PATCH 8/8] Python: collecting constants part I not having single char constants yet all redos results disappeared --- .../src/semmle/python/RegexParserExtended.qll | 101 ++++++++++++++---- python/ql/src/semmle/python/RegexTreeView.qll | 28 +++-- 2 files changed, 95 insertions(+), 34 deletions(-) diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll index 62ade6893b1b..aba7502c2599 100644 --- a/python/ql/src/semmle/python/RegexParserExtended.qll +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -72,13 +72,22 @@ class RegexParserConfiguration extends ParserConfiguration { * | orregex '|' seqregex * | '|' seqregex * | orregex '|' - * seqregex -> primary + * seqregex -> constant + * | primaryseqregex + * | constant primaryseqregex + * primaryseqregex -> // seqregex starting with a primary + * | primary * | primary seqregex - * primary -> group + * constant -> char + * | char constant + * char -> normalchar + * | '-' + * | ']' + * primary -> // not a constant + * | group * | primary '*' * | primary '+' * | primary '?' - * | char * | class * | escclass * group -> '(' regex ')' @@ -102,6 +111,7 @@ class RegexParserConfiguration extends ParserConfiguration { * | classpart_c * | classpart_c- * | '-' + * | '-' classinner1 * classpart_c -> clschar * classpart_c clschar * classpart clschar @@ -115,7 +125,7 @@ class RegexParserConfiguration extends ParserConfiguration { * classrange -> clschar '-' clschar * clschar -> normalchar * | anychar - * | '(', ')', '|', '+', '*', '?' + * | '(', ')', '|', '+', '*', '?', '$' * * Things that currently don't parse: * - Empty regexes (as standalone empty strings, or part of a group, e.g. 
`()`) @@ -125,13 +135,15 @@ class RegexParserConfiguration extends ParserConfiguration { */ override string rule(string a) { - a in ["char", "anychar", "dollar", "caret", "backref", "class", "escclass", "group"] and + a in ["anychar", "dollar", "caret", "backref", "class", "escclass", "group"] and result = "primary" or - a = "primary" and result = "seqregex" + a in ["constant", "primaryseqregex"] and result = "seqregex" + or + a = "char" and result = "constant" + or + a = "primary" and result = "primaryseqregex" or - // a = "orregex|" and result = "orregex" - // or a = "seqregex" and result = "orregex" or a = "orregex" and result = "regex" @@ -141,7 +153,7 @@ class RegexParserConfiguration extends ParserConfiguration { a in ["normalchar", "-", "]"] and result = "char" or - a in ["normalchar", "anychar", "()|+*?[".charAt(_)] and result = "clschar" + a in ["normalchar", "anychar", "()|+*?[$".charAt(_)] and result = "clschar" or a in ["classstart", "classinner1"] and result = "classinner" or @@ -159,7 +171,11 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "orregex" and b = "|" and result = "orregex" or - a = "primary" and b = "seqregex" and result = "seqregex" + a = "constant" and b = "primaryseqregex" and result = "seqregex" + or + a = "primary" and b = "seqregex" and result = "primaryseqregex" + or + a = "char" and b = "constant" and result = "constant" or a = "primary" and b = "*" and result = "primary" or @@ -175,12 +191,28 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "primary" and b = "openrepeat" and result = "primary" or + a = "constant" and b = "*" and result = "primary" + or + a = "constant" and b = "+" and result = "primary" + or + a = "constant" and b = "?" and result = "primary" + or + a = "constant" and b = "fixedrepeat" and result = "primary" + or + a = "constant" and b = "rangerepeat" and result = "primary" + or + a = "constant" and b = "uptorepeat" and result = "primary" + or + a = "constant" and b = "openrepeat" and result = "primary" + or a in ["[", "[^"] and b = "]" and Conf::allowedEmptyClasses() and result = "class" or a = "classstart" and b = "classinner1" and result = "classinner" or a = "classpart" and b = "classinner1" and result = "classinner1" or + a = "-" and b = "classinner1" and result = "classinner1" + or a in ["classpart", "classpart_c"] and b = "clschar" and result = "classpart_c" or a = "classpart_c" and b = "-" and result = "classpart_c-" @@ -304,6 +336,18 @@ class SequenceRegex extends Regex { Regex getRight() { result = this.getRightNode() } } +class ConstantRegex extends Regex { + ConstantRegex() { + this.getId() = "charconstant" and not this.getParent().getId() = "charconstant" + } +} + +predicate isConst(Regex node, Node parent, string parentId) { + node.getId() = "charconstant" and + parent = node.getParent() and + parent.getId() = parentId +} + abstract class SuffixRegex extends Regex { Regex getBody() { if this.isNonGreedy() @@ -332,9 +376,13 @@ class StarRegex extends UnboundedRegex { boolean nonGreedy; StarRegex() { - id = "primary*" and not this.getParent().getId() = "primary?" and nonGreedy = false + id = ["primary", "constant"] + "*" and + not this.getParent().getId() = "primary?" and + nonGreedy = false or - id = "primary?" and this.getLeftNode().getId() = "primary*" and nonGreedy = true + id = ["primary", "constant"] + "?" 
and + this.getLeftNode().getId() = "primary*" and + nonGreedy = true } override predicate isMaybeEmpty() { any() } @@ -346,9 +394,13 @@ class PlusRegex extends UnboundedRegex { boolean nonGreedy; PlusRegex() { - id = "primary+" and not this.getParent().getId() = "primary?" and nonGreedy = false + id = ["primary", "constant"] + "+" and + not this.getParent().getId() = "primary?" and + nonGreedy = false or - id = "primary?" and this.getLeftNode().getId() = "primary+" and nonGreedy = true + id = ["primary", "constant"] + "?" and + this.getLeftNode().getId() = "primary+" and + nonGreedy = true } override predicate isMaybeEmpty() { none() } @@ -357,7 +409,7 @@ class PlusRegex extends UnboundedRegex { } class FixedRepeatRegex extends SuffixRegex, RepeatRegex { - FixedRepeatRegex() { id = "primaryfixedrepeat" } + FixedRepeatRegex() { id = ["primary", "constant"] + "fixedrepeat" } override int getLowerBound() { exists(string suff, string num | @@ -371,7 +423,7 @@ class FixedRepeatRegex extends SuffixRegex, RepeatRegex { } class UptoRepeatRegex extends SuffixRegex, RepeatRegex { - UptoRepeatRegex() { id = "primaryuptorepeat" } + UptoRepeatRegex() { id = ["primary", "constant"] + "uptorepeat" } override int getLowerBound() { result = 0 } @@ -385,7 +437,7 @@ class UptoRepeatRegex extends SuffixRegex, RepeatRegex { } class RangeRegex extends SuffixRegex, RepeatRegex { - RangeRegex() { id = "primaryrangerepeat" } + RangeRegex() { id = ["primary", "constant"] + "rangerepeat" } override int getLowerBound() { exists(string suff, string numl | @@ -405,7 +457,7 @@ class RangeRegex extends SuffixRegex, RepeatRegex { } class OpenRepeatRegex extends UnboundedRegex, RepeatRegex { - OpenRepeatRegex() { id = "primaryopenrepeat" } + OpenRepeatRegex() { id = ["primary", "constant"] + "openrepeat" } override int getLowerBound() { exists(string suff, string num | @@ -422,10 +474,12 @@ class OptionalRegex extends SuffixRegex { boolean nonGreedy; OptionalRegex() { - id = "primary?" and - not this.getLeftNode().getId() = "primary*" and - not this.getLeftNode().getId() = "primary+" and - if this.getLeftNode().getId() = "primary?" then nonGreedy = true else nonGreedy = false + id = ["primary", "constant"] + "?" and + not this.getLeftNode().getId() = ["primary", "constant"] + "*" and + not this.getLeftNode().getId() = ["primary", "constant"] + "+" and + if this.getLeftNode().getId() = ["primary", "constant"] + "?" + then nonGreedy = true + else nonGreedy = false } override predicate isMaybeEmpty() { any() } @@ -510,7 +564,8 @@ string testTokenize(ParsedString text, string id, int pos, int seq) { // text.toString() = "\\A[+-]?\\d+" and // text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and // text.toString() = "(?m)^(?!$)" and - text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and + // text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and + text.toString() = "^[\\_$a-z][\\_$a-z0-9]*(\\[.*?\\])*(\\.[\\_$a-z][\\_$a-z0-9]*(\\[.*?\\])*)*$" and result = tokenize(text, id, pos, seq) } diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index f41a2c7e7a17..8504866ead6d 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -175,20 +175,26 @@ class RegExpOpt extends RegExpQuantifier { } // TODO: This is supposed to be a constant sequence. 
-class RegExpConstant extends RegExpTerm { - RegExpConstant() { - this = TRegExp(node.(P::ChRegex)) - or - this = TClassChar(node) - } +// class RegExpConstant extends RegExpTerm { +// RegExpConstant() { +// this = TRegExp(node.(P::ChRegex)) +// or +// this = TClassChar(node) +// } +// predicate isCharacter() { any() } +// string getValue() { +// result = node.(P::ChRegex).getChar() +// or +// result = node.(P::ClassChar).getChar() +// } +// override string getPrimaryQLClass() { result = "RegExpConstant" } +// } +class RegExpConstant extends NormalRegExpTerm { + override P::ConstantRegex node; predicate isCharacter() { any() } - string getValue() { - result = node.(P::ChRegex).getChar() - or - result = node.(P::ClassChar).getChar() - } + string getValue() { result = node.getText() } override string getPrimaryQLClass() { result = "RegExpConstant" } }
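
A quick way to sanity-check the new tree view from a scratch query: a minimal sketch, not part of the patch series, assuming the modules stay at the `semmle.python.*` paths used above and that `getPrimaryQLClass()` is declared on the base `RegExpTerm` class (the overrides in RegexTreeView.qll suggest it is).

import python
import semmle.python.RegexTreeView

// Sketch only: pairs every regex literal that the extended parser handles
// with the primary QL class of its top-level term (e.g. RegExpSequence,
// RegExpOpt), using only getParsedRegExp and getPrimaryQLClass from above.
from StrConst re, RegExpTerm term
where term = getParsedRegExp(re)
select re, term.getPrimaryQLClass()

An empty result for a literal that should parse usually means the tokenizer rejected it; the testTokenize and testRawTokens predicates in RegexParserExtended.qll are the place to look in that case.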