From bd199b7d8f0273a78b893ef4d810a1f9d4cc1764 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 10 May 2021 15:04:57 +0200 Subject: [PATCH 1/8] Python: Add parser --- python/ql/src/semmle/python/Parser.qll | 445 ++++++++++++++++++ python/ql/src/semmle/python/RegexLiteral.qll | 27 ++ .../src/semmle/python/RegexParserExtended.qll | 424 +++++++++++++++++ 3 files changed, 896 insertions(+) create mode 100644 python/ql/src/semmle/python/Parser.qll create mode 100644 python/ql/src/semmle/python/RegexLiteral.qll create mode 100644 python/ql/src/semmle/python/RegexParserExtended.qll diff --git a/python/ql/src/semmle/python/Parser.qll b/python/ql/src/semmle/python/Parser.qll new file mode 100644 index 000000000000..8876a7d86a5f --- /dev/null +++ b/python/ql/src/semmle/python/Parser.qll @@ -0,0 +1,445 @@ +/** + * Parsing framework for QL + * + * Parser is performed as follows: + * 1. Search for all tokens using `ParsedString.tokens` + * 2. Perform a left-right tokenization, rejecting spurious tokens + * 3. Remove all tokens marked as whitespace or comments. + * 4. Put all tokens into sequence. + * 5. Perform a bottom-up parse of the text. + * + * The parsing algorithm is as follows: + * - all tokens are nodes. + * - for all pairs of adjacent nodes, merge them according to the rules of the grammar + * specifed by `ParserConfiguraiton.rule(...)` + * - iterate until there are no more nodes to generate. + * + * Steps to implement a parser: + * + * 1) Implement a parser configuration by extending the `ParserConfiguration` class. + * + * 1a) Specify the tokenizer. + * + * Specify the tokenizer by providing regexes to match keywords and tokens in the language. + * override `ParserConfiguration.hasTokenRegex`, `ParserConfiguration.hasWhitespaceRegex` + * and `ParserConfiguration.hasCommentRegex`. + * + * For tokens like keywords, the id of the token/node is equal to the matched string. + * + * For tokens like identifiers, use `ParserConfiguration.hasTokenRegex/2` to specify the id of + * the matched token. + * + * 1b) Specify the grammar rules. + * + * Override `ParserConfiguration.rule(...)` to specify the grammar rules. + * + * 2) Extend the class `ParsedString` with strings that contain the text you want to parse. + * Initially, these will simply be test cases. + * Then, it can be literals from a snapshot. + * Then, it will be whole files from the universal extractor matching the right file extension. + * + * 3) Create QL classes for interesting nodes. These will be of the form + * ``` + * abstract class ArithmeticExpr extends ExprNode { + * // All nodes are binary. `this.getLeftNode()` is an intermediate node. + * ExprNode getLeft() { result = this.getLeftNode().getLeftNode() } + * + * ExprNode getRight() { result = this.getRightNode() } + * } + * + * class SqlAddExpr extends ArithmeticExpr { + * SqlAddExpr() { id="expr+expr" } // The exact synthesized node id. + * } + * ``` + */ + +/** + * The configuration of a parser. + * + * Extend this class with each language you need to parse. + */ +abstract class ParserConfiguration extends string { + bindingset[this] + ParserConfiguration() { any() } + + predicate hasFileExtension(string ext) { none() } + + /** Tokens whose id is the same as the token text. */ + predicate hasTokenRegex(string regex) { none() } + + /** Whitespace tokens. */ + predicate hasWhitespaceRegex(string regex) { none() } + + /** Coment tokens. */ + predicate hasCommentRegex(string regex) { none() } + + /** Any other tokens not covered can map to a given token-id. 
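+   * For instance, a hypothetical configuration might add:
+   * ```
+   * override predicate hasTokenRegex(string regex, string id) {
+   *   regex = "[0-9]+" and id = "number"
+   * }
+   * ```
+   * so that runs of digits become `number` tokens (the `number` id is purely
+   * illustrative, not a token kind used by the parsers in this patch).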
*/ + predicate hasTokenRegex(string regex, string id) { none() } + + /** + * Grammar rules of the form + * result -> a + * + * The parser does not generate rules of id=`result`, but instead it + * searches for nodes of id=`a` when considering nodes to create. + */ + string rule(string a) { none() } + + /** + * Grammar rules of the form + * result -> a b + * + * If the parser sees a node id=`a` next to a node of id=`b`, + * then the parser creates a node of id=`a+b`. + * Nodes of id=`a+b` are considered to be nodes of id=`result` + * when considering nodes to create. + */ + string rule(string a, string b) { none() } + + /** + * Grammar rules of the form + * result -> a b c + */ + string rule(string a, string b, string c) { none() } + + private string convert(string a) { + result = rule(a) + or + exists(string a0, string b0, string c0 | result = rule(a0, b0, c0) and a = a0 + b0 + c0) + or + exists(string a0, string b0 | result = rule(a0, b0) and a = a0 + b0) + } + + bindingset[fromKind] + private string convertS(string fromKind) { + result = fromKind + or + result = convert(fromKind) + or + result = convert(convert(fromKind)) + or + result = convert(convert(convert(fromKind))) + or + result = convert(convert(convert(convert(fromKind)))) + } + + private string merge(string a, string b) { + exists(rule(a, b)) and result = a + b + or + exists(string a1, string b1, string c1 | exists(rule(a1, b1, c1)) | + a = a1 and b = b1 and result = a1 + b1 + or + a = a1 + b1 and b = c1 and result = a1 + b1 + c1 + ) + } + + predicate validSrc(string src) { + src = convert(_) or + exists(convert(src)) or + exists(merge(src, _)) or + exists(merge(_, src)) + } + + string convert2(string s) { + validSrc(s) and result = s + or + result = this.convert(s) + or + result = this.convert(convert2(s)) + } + + string merge2(string a, string b) { result = merge(convert2(a), convert2(b)) } + + predicate hasInterpolationRegex(string regex, string id) { none() } +} + +/** + * A string to be parsed. + */ +abstract class ParsedString extends string { + bindingset[this] + ParsedString() { any() } + + abstract ParserConfiguration getConfiguration(); + + /** + * Gets the tokens in the string. + * Override this predicate to implement your tokenizer. + * `start` is the offset of the token in this string. + * `id` is a meaningful identifier. + */ + cached + string tokens(int pos, string id) { + exists(ParserConfiguration config | config = this.getConfiguration() | + result = this.keywordToken(pos) and id = result.toUpperCase() + or + exists(string regex | config.hasWhitespaceRegex(regex) | + result = this.regexpFind(regex, _, pos) and id = "ws" + ) + or + exists(string regex | config.hasCommentRegex(regex) | + result = this.regexpFind(regex, _, pos) and id = "comment" + ) + or + exists(string regex | config.hasTokenRegex(regex, id) | + result = this.regexpFind(regex, _, pos) and + not result = this.keywordToken(pos) + ) + ) + } + + // Shouldn't need to cache this + cached + private string keywordToken(int pos) { + exists(string regex | this.getConfiguration().hasTokenRegex(regex) | + result = this.regexpFind(regex, _, pos) + ) + } + + /** + * Gets the syntax nodes in this parsed string. + * override this predicate with the grammar rules. + */ + abstract predicate getLocationInfo( + string file, int startLine, int startCol, int endLine, int endCol + ); + + // This is basically the parsing algorithm. + // - All tokens are nodes. + // - If you find two adjcacent nodes that can be merged, create a new node. 
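+  // For example, given a hypothetical rule `rule("expr", "+", "expr") = "expr"`,
+  // adjacent nodes with ids `expr` and `+` first merge into an intermediate node
+  // with id `expr+`, which then merges with a following `expr` node into a node
+  // with id `expr+expr`; that node is treated as an `expr` for further merges.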
+ predicate nodes(int start, int next, string id) { + exists(tokenize(this, id, _, start)) and next = start + 1 + or + exists(ParserConfiguration config, int mid, string id0, string id1 | + this.nodes(start, mid, id0) and + this.nodes(mid, next, id1) and + config = this.getConfiguration() and + id = config.merge2(id0, id1) + //id = config.merge(config.convertS(id0), config.convertS(id1)) + ) + } +} + +private predicate lines(ParsedString str, int index, int line) { + line = 0 and index = 0 + or + index = rank[line](int x | x = str.indexOf("\n") or x = str.length()) +} + +// Maps the position `pos` to a row and column +// within the string `text`. This is used for computing +// locations to nodes. +private predicate rowCol(ParsedString str, int index, int line, int col) { + exists(int index1, int index2 | + lines(str, index1, line - 1) and + lines(str, index2, line) and + index in [index1 .. index2 - 1] and + col = index - index1 + ) +} + +newtype TNode = + TNonterminalNode(ParsedString text, int startIndex, int endIndex, string id) { + text.nodes(startIndex, endIndex, id) + } + +/** + * Recursive predicate representing all nodes in the parse tree. + */ +predicate nodes(TNode node, ParsedString text, int start, int next, string id) { + node = TNonterminalNode(text, start, next, id) +} + +/** + * A syntax node. + */ +class Node extends TNode { + ParsedString text; + int start; + int next; + string id; + + /** Gets the token- or node- id of this node. */ + string getId() { result = id } + + /** Holds if this node is convertible to `toid`. */ + predicate hasId(string toid) { toid = text.getConfiguration().convert2(id) } + + /** Gets the offset of the text in the string. */ + int getStartOffset() { exists(tokenize(text, _, result, start)) } + + /** Gets the offset of the end of the text in the string. */ + int getEndOffset1() { exists(int pos | result = tokenize(text, _, pos, next - 1).length() + pos) } + + int getEndOffset2() { + // The offset of the end of the first token! + exists(int startPos | result = tokenize(text, _, startPos, start).length() + startPos) + } + + int getEndOffset() { + // The end offset isn't the end of the last token + // if this node is an interpolated string. + if this.getEndOffset1() < this.getEndOffset2() + then result = this.getEndOffset2() + else result = this.getEndOffset1() + } + + Node() { nodes(this, text, start, next, id) } + + predicate isBefore(Node other) { + exists(int otherstart | + nodes(other, text, otherstart, _, _) and + start < otherstart + ) + } + + string toString() { result = this.getText() } + + string getText() { result = text.substring(this.getStartOffset(), this.getEndOffset()) } + + /** + * Creates a location for the node using the location of the text, + * then adjust the starts and ends based on `start` and `end`. + */ + predicate hasLocationInfo(string file, int startLine, int startCol, int endLine, int endCol) { + exists(int line, int col | + text.getLocationInfo(file, line, col, _, _) and + nodeLocation(text, this.getStartOffset(), line, col, startLine, startCol) and + nodeLocation(text, this.getEndOffset() - 1, line, col, endLine, endCol) + ) + } + + predicate follows(Node previous) { nodes(previous, text, _, start, _) } + + /** + * Gets the left child of this node, if any. + * All nodes are terminal or binary. + */ + Node getLeftNode() { this.splits(result, _) } + + /** + * Gets the right child of this node, if any. + * All nodes are terminal or binary. 
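+   * Productions with more than two symbols therefore appear as a chain of
+   * binary nodes, with the intermediate node on the left.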
+ */ + Node getRightNode() { this.splits(_, result) } + + predicate splits(Node left, Node right) { + exists(ParserConfiguration config, int mid, string id0, string id1 | + nodes(left, text, start, mid, id0) and + nodes(right, text, mid, next, id1) and + id = config.merge2(id0, id1) + ) + } + + /** Gets a child node of this node, if any. */ + Node getAChildNode() { result = this.getLeftNode() or result = this.getRightNode() } + + /** Gets the parent of this node, if any. */ + Node getParent() { this = result.getAChildNode() } + + /** + * Holds if this is the root node that spans the entire input. + * It is not sufficient to not have a parent: that could be a parsed fragment. + */ + predicate isRoot() { start = 1 and next = getNumberOfTokens(text) + 1 } + + /** Holds if this node has a path to the root node. */ + predicate isRooted() { this.isRoot() or this.getParent().isRooted() } +} + +/** A node that is a token. */ +class Token extends Node { + Token() { next = start + 1 } +} + +pragma[noopt] +private predicate nodeLocation(ParsedString text, int pos, int line0, int col0, int line, int col) { + text.getLocationInfo(_, line0, col0, _, _) and + exists(int l, int c | rowCol(text, pos, l, c) | + line = line0 + l and + col = col0 + c + ) +} + +/** + * Performs a tokenization of the source text, ensuring that + * tokens are contiguous to remove spurious tokens (e.g. contents of strings). + * This tokenization includes whitespace and comment tokens + * that we will filter out later (in `nonWs`). + */ +pragma[noopt] +private string leftrightTokenize(ParsedString text, string id, int pos) { + result = longestToken(text, id, 0) and pos = 0 + or + exists(string prevText, int prevPos, int prevLength | + prevText = leftrightTokenize(text, _, prevPos) and + prevLength = prevText.length() and + pos = prevPos + prevLength and + result = longestToken(text, id, pos) + ) +} + +private string interpolatedToken(ParsedString text, string id, int pos) { + exists(string regex | text.getConfiguration().hasInterpolationRegex(regex, id) | + result = text.regexpFind(regex, _, pos) + ) +} + +// Special handling of interpolated strings +private string interpolatedStringTokens(ParsedString text, string id, int pos) { + exists(string interpolatedString, int start, int end | + interpolatedString = leftrightTokenize(text, "interpolatedstring", start) and + end = start + interpolatedString.length() + | + result = interpolatedToken(text, id, pos) and + pos > start and + pos < start + end - 1 + // pos in [start + 1, start + end - 1] + ) +} + +// Debugging predicate: Indicates which source strings are successfully tokenized. +predicate successfullyTokenized(ParsedString text) { tokenizedLength(text) = text.length() } + +// Debugging predicate: Indicates which source strings have not been +// successfully tokenized. Quick-eval this to see where the tokenizer has failed. +predicate unsuccessfullyTokenized(ParsedString text, int length, string failedAt) { + length = tokenizedLength(text) and + not successfullyTokenized(text) and + failedAt = text.suffix(length) +} + +// Gets the number of characters that were successfully tokenized in a source string. +// This is useful to debug the tokenizer. +int tokenizedLength(ParsedString text) { + exists(int maxpos | + maxpos = max(int pos | exists(leftrightTokenize(text, _, pos))) and + result = maxpos + leftrightTokenize(text, _, maxpos).length() + ) +} + +// Tidy up `tokens`. If the same position can have two different tokens, +// pick the longest token. 
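+// For example, if a configuration happened to match both `<` and `<=` at the same
+// offset (a hypothetical case), only the two-character token would be kept.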
+private string longestToken(ParsedString text, string id, int pos) { + result = text.tokens(pos, id) and + not text.tokens(pos, _).length() > result.length() +} + +private string nonWs(ParsedString text, string id, int pos) { + result = leftrightTokenize(text, id, pos) and + (id != "ws" and id != "comment") + or + result = interpolatedStringTokens(text, id, pos) +} + +/** + * Tokenizes the string left-right, removing whitespace and comments, + * and creates a rank `seq` for each token for all non-whitespace tokens. + */ +cached +string tokenize(ParsedString text, string id, int pos, int seq) { + pos = rank[seq](int p | exists(nonWs(text, _, p)) | p) and + result = nonWs(text, id, pos) +} + +int getNumberOfTokens(ParsedString text) { result = max(int n | exists(tokenize(text, _, _, n))) } diff --git a/python/ql/src/semmle/python/RegexLiteral.qll b/python/ql/src/semmle/python/RegexLiteral.qll new file mode 100644 index 000000000000..e6c324098321 --- /dev/null +++ b/python/ql/src/semmle/python/RegexLiteral.qll @@ -0,0 +1,27 @@ +import python +import semmle.python.regex as R +private import RegexParserExtended + +class RegexLiteralValue extends ParsedString { + R::Regex lit; + + RegexLiteralValue() { this = lit.getText() } + + override ParserConfiguration getConfiguration() { result instanceof RegexParserConfiguration } + + override predicate getLocationInfo( + string file, int startline, int startcol, int endline, int endcol + ) { + lit.getLocation().hasLocationInfo(file, startline + 1, startcol - 2, endline + 1, endcol - 2) + } + + R::Regex getLiteral() { result = lit } +} + +class RegexLiteral extends R::Regex { + RegexLiteralValue val; + + RegexLiteral() { val.getLiteral() = this } + + Regex getRegex() { result.getText() = val and result.isRoot() } +} diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll new file mode 100644 index 000000000000..d6a6a009afed --- /dev/null +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -0,0 +1,424 @@ +import Parser +private import RegexLiteral + +module RegexSpecific { + predicate allowedEmptyClasses() { none() } +} + +import RegexSpecific as Conf + +class RegexParserConfiguration extends ParserConfiguration { + RegexParserConfiguration() { this = "Extended regex parser configuration" } + + override predicate hasTokenRegex(string regex) { + regex = "[()|*+?\\-\\[\\]]" + or + regex = "\\[\\^" + } + + override predicate hasTokenRegex(string regex, string id) { + regex = "[^()|.$\\^\\[\\]\\\\]" and id = "normalchar" + or + regex = "\\\\[0-9]+" and id = "backref" + or + regex = "\\(\\?P=\\w+\\)" and id = "backref" + or + regex = "[.]" and id = "anychar" + or + regex = "[$]" and id = "dollar" + or + regex = "[\\^]" and id = "caret" + or + regex = "\\{[0-9]+\\}" and id = "fixedrepeat" + or + regex = "\\{,[0-9]+\\}" and id = "uptorepeat" + or + regex = "\\{[0-9]+,[0-9]+\\}" and id = "rangerepeat" + or + regex = "\\{[0-9]+,\\}" and id = "openrepeat" + or + regex = "\\\\[^AbBdDsSwWZafnNrtuUvx\\\\0-9]" and id = "normalchar" + or + regex = "\\\\[AbBdDsSwWZafnNrtuUvx\\\\]" and id = "escclass" + or + regex = "\\(\\?[aiLmsux]+\\)" and id = "confgroup" + or + regex = "\\(\\?:" and id = "(" + or + regex = "\\(\\?[aiLmsux]*-[imsx]+:" and id = "(" + or + regex = "\\(\\?#" and id = "(?#" + or + regex = "\\(\\?=" and id = "(?=" + or + regex = "\\(\\?!" and id = "(?!" + or + regex = "\\(\\?<=" and id = "(?<=" + or + regex = "\\(\\?" 
and id = "(named" + } + + predicate testRegex() { + // "(?P".regexpMatch("\\(\\?P<[:alnum:]+>") + "n1".regexpMatch("\\w+") + } + + /* + * Use a proper unambiguous grammar for regexes: + * + * regex -> orregex + * orregex -> seqregex + * | orregex '|' seqregex + * seqregex -> primary + * | primary seqregex + * primary -> group + * | primary * + * | primary + + * | char + * | class + * | escclass + * group -> '(' regex ')' + * | '(?#' regex ')' + * | '(?=' regex ')' + * | '(?!' regex ')' + * | '(?<=' regex ')' + * | '(? '[' classinner ']' + * | '[^' classinner ']' + * | '[]' if allowed empty classes + * | '[^]' if allowed empty classes + * classinner -> classstart classinner1 + * | classstart + * classinner1 -> classinner2 '-' + * | classinner2 + * classinner2 -> classpart + * | classpart classinner2 + * classstart -> '-' + * | ']' if not allowed empty classes + * | classpart + * classpart -> normalchar + * | classrange + * | escclass + * classrange -> normalchar '-' normalchar + * + * + * Things that currently don't parse: + * - Empty regexes (as standalone empty strings, or part of a disjunction or group, e.g. `(a|)` or `()`) + * - Inline options, i.e. `(?s)` + * - Lookaheads/lookbehinds + * - Java specific: Nested character classes, intersecting character classes + * + * Things that parse but with the wrong semantics: + * - Possesive and reluctant quantifiers (`a*?` is treated as an optional regex with body `a*`) + * - Most escape sequences with special meanings (i.e. besides "quote the next character" or predefined character classes) + */ + + override string rule(string a) { + a in ["char", "anychar", "dollar", "caret", "backref", "class", "escclass", "group"] and + result = "primary" + or + a = "primary" and result = "seqregex" + or + a = "seqregex" and result = "orregex" + or + a = "orregex" and result = "regex" + or + a = "confgroup" and result = "group" + or + a in ["normalchar", "-", "]"] and + result = "char" + or + a in ["normalchar", "anychar", "()|+*?".charAt(_)] and result = "clschar" + or + a = "classstart" and result = "classinner" + or + a = "classinner2" and result = "classinner1" + or + a in ["classpart", "-"] and result = "classstart" + or + a = "classpart" and result = "classinner2" + or + a = "]" and not Conf::allowedEmptyClasses() and result = "classstart" + or + a in ["clschar", "classrange", "escclass"] and result = "classpart" + } + + override string rule(string a, string b) { + a = "primary" and b = "seqregex" and result = "seqregex" + or + a = "primary" and b = "*" and result = "primary" + or + a = "primary" and b = "+" and result = "primary" + or + a = "primary" and b = "?" 
and result = "primary" + or + a = "primary" and b = "fixedrepeat" and result = "primary" + or + a = "primary" and b = "rangerepeat" and result = "primary" + or + a = "primary" and b = "uptorepeat" and result = "primary" + or + a = "primary" and b = "openrepeat" and result = "primary" + or + a in ["[", "[^"] and b = "]" and Conf::allowedEmptyClasses() and result = "class" + or + a = "classstart" and b = "classinner1" and result = "classinner" + or + a = "classpart" and b = "classinner2" and result = "classinner2" + or + a = "classinner2" and b = "-" and result = "classinner1" + } + + override string rule(string a, string b, string c) { + a = "orregex" and b = "|" and c = "seqregex" and result = "orregex" + or + a = "(" and b = "regex" and c = ")" and result = "group" + or + a = "(?#" and b = "regex" and c = ")" and result = "group" + or + a = "(?=" and b = "regex" and c = ")" and result = "group" + or + a = "(?!" and b = "regex" and c = ")" and result = "group" + or + a = "(?<=" and b = "regex" and c = ")" and result = "group" + or + a = "(?[^[]*)\\]\\((?P[^)]*)" and + result = tokenize(text, id, pos, seq) +} From 93c5896eebdbfd097dd2d216899eecd568e6aa9c Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 10 May 2021 15:06:01 +0200 Subject: [PATCH 2/8] Python: Add regex parser tests --- .../regexparser/Alternation.expected | 22 ++ .../library-tests/regexparser/Alternation.ql | 7 + .../regexparser/Characters.expected | 134 ++++++++++ .../library-tests/regexparser/Characters.ql | 11 + .../regexparser/FirstLast.expected | 103 ++++++++ .../library-tests/regexparser/FirstLast.ql | 12 + .../regexparser/GroupContents.expected | 18 ++ .../regexparser/GroupContents.ql | 7 + .../library-tests/regexparser/Mode.expected | 13 + .../ql/test/library-tests/regexparser/Mode.ql | 5 + .../regexparser/Qualified.expected | 15 ++ .../library-tests/regexparser/Qualified.ql | 6 + .../library-tests/regexparser/Regex.expected | 243 ++++++++++++++++++ .../test/library-tests/regexparser/Regex.ql | 52 ++++ .../ql/test/library-tests/regexparser/test.py | 72 ++++++ 15 files changed, 720 insertions(+) create mode 100644 python/ql/test/library-tests/regexparser/Alternation.expected create mode 100644 python/ql/test/library-tests/regexparser/Alternation.ql create mode 100644 python/ql/test/library-tests/regexparser/Characters.expected create mode 100644 python/ql/test/library-tests/regexparser/Characters.ql create mode 100644 python/ql/test/library-tests/regexparser/FirstLast.expected create mode 100644 python/ql/test/library-tests/regexparser/FirstLast.ql create mode 100644 python/ql/test/library-tests/regexparser/GroupContents.expected create mode 100644 python/ql/test/library-tests/regexparser/GroupContents.ql create mode 100644 python/ql/test/library-tests/regexparser/Mode.expected create mode 100644 python/ql/test/library-tests/regexparser/Mode.ql create mode 100644 python/ql/test/library-tests/regexparser/Qualified.expected create mode 100644 python/ql/test/library-tests/regexparser/Qualified.ql create mode 100644 python/ql/test/library-tests/regexparser/Regex.expected create mode 100644 python/ql/test/library-tests/regexparser/Regex.ql create mode 100644 python/ql/test/library-tests/regexparser/test.py diff --git a/python/ql/test/library-tests/regexparser/Alternation.expected b/python/ql/test/library-tests/regexparser/Alternation.expected new file mode 100644 index 000000000000..2fe6572074e6 --- /dev/null +++ b/python/ql/test/library-tests/regexparser/Alternation.expected @@ -0,0 +1,22 @@ +| 
(?:(?:\n\r?)\|^)( *)\\S | 3 | 12 | (?:\n\r?)\|^ | 3 | 10 | (?:\n\r?) | +| (?:(?:\n\r?)\|^)( *)\\S | 3 | 12 | (?:\n\r?)\|^ | 11 | 12 | ^ | +| (?:(?P^(?:\|x))) | 14 | 16 | \|x | 14 | 14 | | +| (?:(?P^(?:\|x))) | 14 | 16 | \|x | 15 | 16 | x | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 3 | 9 | [^%]\|^ | 3 | 7 | [^%] | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 3 | 9 | [^%]\|^ | 8 | 9 | ^ | +| (?P[\\w]+)\| | 0 | 16 | (?P[\\w]+)\| | 0 | 15 | (?P[\\w]+) | +| (?P[\\w]+)\| | 0 | 16 | (?P[\\w]+)\| | 16 | 16 | | +| (\\033\|~{) | 1 | 8 | \\033\|~{ | 1 | 5 | \\033 | +| (\\033\|~{) | 1 | 8 | \\033\|~{ | 6 | 8 | ~{ | +| \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 0 | 11 | \\\|\\[\\][123] | +| \\\|\\[\\][123]\|\\{\\} | 0 | 16 | \\\|\\[\\][123]\|\\{\\} | 12 | 16 | \\{\\} | +| \|x | 0 | 2 | \|x | 0 | 0 | | +| \|x | 0 | 2 | \|x | 1 | 2 | x | +| ^(^y\|^z)(u$\|v$)$ | 2 | 7 | ^y\|^z | 2 | 4 | ^y | +| ^(^y\|^z)(u$\|v$)$ | 2 | 7 | ^y\|^z | 5 | 7 | ^z | +| ^(^y\|^z)(u$\|v$)$ | 9 | 14 | u$\|v$ | 9 | 11 | u$ | +| ^(^y\|^z)(u$\|v$)$ | 9 | 14 | u$\|v$ | 12 | 14 | v$ | +| x\| | 0 | 2 | x\| | 0 | 1 | x | +| x\| | 0 | 2 | x\| | 2 | 2 | | +| x\|(?^(?:\|x))) | 10 | 11 | +| (?:(?P^(?:\|x))) | 15 | 16 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 5 | 6 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 8 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 11 | 12 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 12 | 14 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 15 | 17 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 19 | 21 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 22 | 23 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 24 | 25 | +| (?P[\\w]+)\| | 10 | 12 | +| (?m)^(?!$) | 4 | 5 | +| (?m)^(?!$) | 8 | 9 | +| (\\033\|~{) | 1 | 5 | +| (\\033\|~{) | 6 | 7 | +| (\\033\|~{) | 7 | 8 | +| [\ufffd-\ufffd] | 1 | 2 | +| [\ufffd-\ufffd] | 3 | 4 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 1 | 2 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 3 | 4 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 6 | 7 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | 8 | 9 | +| []] | 1 | 2 | +| [^-] | 2 | 3 | +| [^A-Z] | 2 | 3 | +| [^A-Z] | 4 | 5 | +| [^]] | 2 | 3 | +| \\A[+-]?\\d+ | 0 | 2 | +| \\A[+-]?\\d+ | 3 | 4 | +| \\A[+-]?\\d+ | 4 | 5 | +| \\A[+-]?\\d+ | 7 | 9 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 0 | 2 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 12 | 13 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 16 | 18 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 18 | 20 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 30 | 31 | +| \\\|\\[\\][123]\|\\{\\} | 0 | 2 | +| \\\|\\[\\][123]\|\\{\\} | 2 | 4 | +| \\\|\\[\\][123]\|\\{\\} | 4 | 6 | +| \\\|\\[\\][123]\|\\{\\} | 7 | 8 | +| \\\|\\[\\][123]\|\\{\\} | 8 | 9 | +| \\\|\\[\\][123]\|\\{\\} | 9 | 10 | +| \\\|\\[\\][123]\|\\{\\} | 12 | 14 | +| \\\|\\[\\][123]\|\\{\\} | 14 | 16 | +| \|x | 1 | 2 | +| ^(^y\|^z)(u$\|v$)$ | 0 | 1 | +| ^(^y\|^z)(u$\|v$)$ | 2 | 3 | +| ^(^y\|^z)(u$\|v$)$ | 3 | 4 | +| ^(^y\|^z)(u$\|v$)$ | 5 | 6 | +| ^(^y\|^z)(u$\|v$)$ | 6 | 7 | +| ^(^y\|^z)(u$\|v$)$ | 9 | 10 | +| ^(^y\|^z)(u$\|v$)$ | 10 | 11 | +| ^(^y\|^z)(u$\|v$)$ | 12 | 13 | +| ^(^y\|^z)(u$\|v$)$ | 13 | 14 | +| ^(^y\|^z)(u$\|v$)$ | 15 | 16 | +| ^.$ | 0 | 1 | +| ^.$ | 1 | 2 | +| ^.$ | 2 | 3 | +| ^[A-Z_]+$(?^(?:\|x))) | first | 10 | 11 | +| (?:(?P^(?:\|x))) | first | 15 | 16 | +| (?:(?P^(?:\|x))) | last | 15 | 16 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 0 | 11 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 3 | 7 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 8 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | first | 11 | 12 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | last | 21 | 26 | +| (?P[\\w]+)\| | first | 9 | 13 | +| (?P[\\w]+)\| | first | 9 | 14 | +| (?P[\\w]+)\| | last | 9 | 13 | +| (?P[\\w]+)\| 
| last | 9 | 14 | +| (?m)^(?!$) | first | 4 | 5 | +| (?m)^(?!$) | first | 8 | 9 | +| (?m)^(?!$) | last | 4 | 5 | +| (?m)^(?!$) | last | 8 | 9 | +| (\\033\|~{) | first | 1 | 5 | +| (\\033\|~{) | first | 6 | 7 | +| (\\033\|~{) | last | 1 | 5 | +| (\\033\|~{) | last | 7 | 8 | +| [\ufffd-\ufffd] | first | 0 | 5 | +| [\ufffd-\ufffd] | last | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | first | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | last | 5 | 10 | +| []] | first | 0 | 3 | +| []] | last | 0 | 3 | +| [^-] | first | 0 | 4 | +| [^-] | last | 0 | 4 | +| [^A-Z] | first | 0 | 6 | +| [^A-Z] | last | 0 | 6 | +| [^]] | first | 0 | 4 | +| [^]] | last | 0 | 4 | +| \\A[+-]?\\d+ | first | 0 | 2 | +| \\A[+-]?\\d+ | last | 7 | 9 | +| \\A[+-]?\\d+ | last | 7 | 10 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | first | 0 | 2 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | last | 28 | 32 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | last | 28 | 33 | +| \\\|\\[\\][123]\|\\{\\} | first | 0 | 2 | +| \\\|\\[\\][123]\|\\{\\} | first | 12 | 14 | +| \\\|\\[\\][123]\|\\{\\} | last | 6 | 11 | +| \\\|\\[\\][123]\|\\{\\} | last | 14 | 16 | +| \|x | first | 1 | 2 | +| \|x | last | 1 | 2 | +| ^(^y\|^z)(u$\|v$)$ | first | 0 | 1 | +| ^(^y\|^z)(u$\|v$)$ | first | 2 | 3 | +| ^(^y\|^z)(u$\|v$)$ | first | 3 | 4 | +| ^(^y\|^z)(u$\|v$)$ | first | 5 | 6 | +| ^(^y\|^z)(u$\|v$)$ | first | 6 | 7 | +| ^(^y\|^z)(u$\|v$)$ | last | 9 | 10 | +| ^(^y\|^z)(u$\|v$)$ | last | 10 | 11 | +| ^(^y\|^z)(u$\|v$)$ | last | 12 | 13 | +| ^(^y\|^z)(u$\|v$)$ | last | 13 | 14 | +| ^(^y\|^z)(u$\|v$)$ | last | 15 | 16 | +| ^.$ | first | 0 | 1 | +| ^.$ | first | 1 | 2 | +| ^.$ | last | 1 | 2 | +| ^.$ | last | 2 | 3 | +| ^[A-Z_]+$(?^(?:\|x))) | 0 | 19 | (?:(?P^(?:\|x))) | 3 | 18 | (?P^(?:\|x)) | +| (?:(?P^(?:\|x))) | 3 | 18 | (?P^(?:\|x)) | 10 | 17 | ^(?:\|x) | +| (?:(?P^(?:\|x))) | 11 | 17 | (?:\|x) | 14 | 16 | \|x | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 0 | 10 | (?:[^%]\|^) | 3 | 9 | [^%]\|^ | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | 14 | 19 | (\\w*) | 15 | 18 | \\w* | +| (?P[\\w]+)\| | 0 | 15 | (?P[\\w]+) | 9 | 14 | [\\w]+ | +| (?m)^(?!$) | 5 | 10 | (?!$) | 8 | 9 | $ | +| (\\033\|~{) | 0 | 9 | (\\033\|~{) | 1 | 8 | \\033\|~{ | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 2 | 16 | (?P[^[]*) | 10 | 15 | [^[]* | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 20 | 34 | (?P[^)]*) | 28 | 33 | [^)]* | +| ^(^y\|^z)(u$\|v$)$ | 1 | 8 | (^y\|^z) | 2 | 7 | ^y\|^z | +| ^(^y\|^z)(u$\|v$)$ | 8 | 15 | (u$\|v$) | 9 | 14 | u$\|v$ | +| ^[A-Z_]+$(?[\\w]+)\| | 9 | 14 | false | +| \\A[+-]?\\d+ | 2 | 7 | true | +| \\A[+-]?\\d+ | 7 | 10 | false | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 10 | 15 | true | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | 28 | 33 | true | +| ^[A-Z_]+$(?^(?:\|x))) | ^ | 10 | 11 | +| (?:(?P^(?:\|x))) | char | 15 | 16 | +| (?:(?P^(?:\|x))) | choice | 14 | 16 | +| (?:(?P^(?:\|x))) | non-empty group | 0 | 19 | +| (?:(?P^(?:\|x))) | non-empty group | 3 | 18 | +| (?:(?P^(?:\|x))) | non-empty group | 11 | 17 | +| (?:(?P^(?:\|x))) | sequence | 0 | 19 | +| (?:(?P^(?:\|x))) | sequence | 3 | 18 | +| (?:(?P^(?:\|x))) | sequence | 10 | 17 | +| (?:(?P^(?:\|x))) | sequence | 15 | 16 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | ^ | 8 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 5 | 6 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 11 | 12 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 12 | 14 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 15 | 17 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 19 | 21 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 22 | 23 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char | 24 | 25 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char-set | 3 | 7 | +| 
(?:[^%]\|^)?%\\((\\w*)\\)[a-z] | char-set | 21 | 26 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | choice | 3 | 9 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | non-empty group | 0 | 10 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | non-empty group | 14 | 19 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | qualified | 0 | 11 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | qualified | 15 | 18 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 0 | 26 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 3 | 7 | +| (?:[^%]\|^)?%\\((\\w*)\\)[a-z] | sequence | 8 | 9 | +| (?P[\\w]+)\| | char | 10 | 12 | +| (?P[\\w]+)\| | char-set | 9 | 13 | +| (?P[\\w]+)\| | choice | 0 | 16 | +| (?P[\\w]+)\| | non-empty group | 0 | 15 | +| (?P[\\w]+)\| | qualified | 9 | 14 | +| (?P[\\w]+)\| | sequence | 0 | 15 | +| (?m)^(?!$) | $ | 8 | 9 | +| (?m)^(?!$) | ^ | 4 | 5 | +| (?m)^(?!$) | empty group | 0 | 4 | +| (?m)^(?!$) | empty group | 5 | 10 | +| (?m)^(?!$) | sequence | 0 | 10 | +| (?m)^(?!$) | sequence | 8 | 9 | +| (\\033\|~{) | char | 1 | 5 | +| (\\033\|~{) | char | 6 | 7 | +| (\\033\|~{) | char | 7 | 8 | +| (\\033\|~{) | choice | 1 | 8 | +| (\\033\|~{) | non-empty group | 0 | 9 | +| (\\033\|~{) | sequence | 0 | 9 | +| (\\033\|~{) | sequence | 1 | 5 | +| (\\033\|~{) | sequence | 6 | 8 | +| [\ufffd-\ufffd] | char | 1 | 2 | +| [\ufffd-\ufffd] | char | 3 | 4 | +| [\ufffd-\ufffd] | char-set | 0 | 5 | +| [\ufffd-\ufffd] | sequence | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 1 | 2 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 3 | 4 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 6 | 7 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char | 8 | 9 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char-set | 0 | 5 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | char-set | 5 | 10 | +| [\ufffd-\ufffd][\ufffd-\ufffd] | sequence | 0 | 10 | +| []] | char | 1 | 2 | +| []] | char-set | 0 | 3 | +| []] | sequence | 0 | 3 | +| [^-] | char | 2 | 3 | +| [^-] | char-set | 0 | 4 | +| [^-] | sequence | 0 | 4 | +| [^A-Z] | char | 2 | 3 | +| [^A-Z] | char | 4 | 5 | +| [^A-Z] | char-set | 0 | 6 | +| [^A-Z] | sequence | 0 | 6 | +| [^]] | char | 2 | 3 | +| [^]] | char-set | 0 | 4 | +| [^]] | sequence | 0 | 4 | +| \\A[+-]?\\d+ | char | 0 | 2 | +| \\A[+-]?\\d+ | char | 3 | 4 | +| \\A[+-]?\\d+ | char | 4 | 5 | +| \\A[+-]?\\d+ | char | 7 | 9 | +| \\A[+-]?\\d+ | char-set | 2 | 6 | +| \\A[+-]?\\d+ | qualified | 2 | 7 | +| \\A[+-]?\\d+ | qualified | 7 | 10 | +| \\A[+-]?\\d+ | sequence | 0 | 10 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 0 | 2 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 12 | 13 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 16 | 18 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 18 | 20 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char | 30 | 31 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char-set | 10 | 14 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | char-set | 28 | 32 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | non-empty group | 2 | 16 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | non-empty group | 20 | 34 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | qualified | 10 | 15 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | qualified | 28 | 33 | +| \\[(?P[^[]*)\\]\\((?P[^)]*) | sequence | 0 | 34 | +| \\\|\\[\\][123]\|\\{\\} | char | 0 | 2 | +| \\\|\\[\\][123]\|\\{\\} | char | 2 | 4 | +| \\\|\\[\\][123]\|\\{\\} | char | 4 | 6 | +| \\\|\\[\\][123]\|\\{\\} | char | 7 | 8 | +| \\\|\\[\\][123]\|\\{\\} | char | 8 | 9 | +| \\\|\\[\\][123]\|\\{\\} | char | 9 | 10 | +| \\\|\\[\\][123]\|\\{\\} | char | 12 | 14 | +| \\\|\\[\\][123]\|\\{\\} | char | 14 | 16 | +| \\\|\\[\\][123]\|\\{\\} | char-set | 6 | 11 | +| \\\|\\[\\][123]\|\\{\\} | choice | 0 | 16 | +| \\\|\\[\\][123]\|\\{\\} | sequence | 0 | 11 | +| 
\\\|\\[\\][123]\|\\{\\} | sequence | 12 | 16 | +| \|x | char | 1 | 2 | +| \|x | choice | 0 | 2 | +| \|x | sequence | 1 | 2 | +| ^(^y\|^z)(u$\|v$)$ | $ | 10 | 11 | +| ^(^y\|^z)(u$\|v$)$ | $ | 13 | 14 | +| ^(^y\|^z)(u$\|v$)$ | $ | 15 | 16 | +| ^(^y\|^z)(u$\|v$)$ | ^ | 0 | 1 | +| ^(^y\|^z)(u$\|v$)$ | ^ | 2 | 3 | +| ^(^y\|^z)(u$\|v$)$ | ^ | 5 | 6 | +| ^(^y\|^z)(u$\|v$)$ | char | 3 | 4 | +| ^(^y\|^z)(u$\|v$)$ | char | 6 | 7 | +| ^(^y\|^z)(u$\|v$)$ | char | 9 | 10 | +| ^(^y\|^z)(u$\|v$)$ | char | 12 | 13 | +| ^(^y\|^z)(u$\|v$)$ | choice | 2 | 7 | +| ^(^y\|^z)(u$\|v$)$ | choice | 9 | 14 | +| ^(^y\|^z)(u$\|v$)$ | non-empty group | 1 | 8 | +| ^(^y\|^z)(u$\|v$)$ | non-empty group | 8 | 15 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 0 | 16 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 2 | 4 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 5 | 7 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 9 | 11 | +| ^(^y\|^z)(u$\|v$)$ | sequence | 12 | 14 | +| ^.$ | $ | 2 | 3 | +| ^.$ | . | 1 | 2 | +| ^.$ | ^ | 0 | 1 | +| ^.$ | sequence | 0 | 3 | +| ^[A-Z_]+$(? r.getEndOffset() + ) + ) and + kind = "sequence" + or + r instanceof ClassRegex and kind = "char-set" + or + zeroWidthMatch(r) and kind = "empty group" + or + r instanceof GroupRegex and not zeroWidthMatch(r) and kind = "non-empty group" + or + r instanceof SuffixRegex and kind = "qualified" + ) +} + +from Regex r, Regex part, int start, int end, string kind +where + part(part, start, end, kind) and // and r.hasLocationInfo("test.py", _, _, _, _) + r.isRoot() and + r = part.getParent*() +select r.getText(), kind, start, end diff --git a/python/ql/test/library-tests/regexparser/test.py b/python/ql/test/library-tests/regexparser/test.py new file mode 100644 index 000000000000..a113b85d3c98 --- /dev/null +++ b/python/ql/test/library-tests/regexparser/test.py @@ -0,0 +1,72 @@ +import re +# 0123456789ABCDEF +re.compile(r'012345678') +re.compile(r'(\033|~{)') +re.compile(r'\A[+-]?\d+') +re.compile(r'(?P[\w]+)|') +re.compile(r'\|\[\][123]|\{\}') +re.compile(r'^.$') +re.compile(r'[^A-Z]') +# 0123456789ABCDEF +re.sub('(?m)^(?!$)', indent*' ', s) +re.compile("(?:(?:\n\r?)|^)( *)\S") +re.compile("[]]") +re.compile("[^]]") +re.compile("[^-]") + +#Lookbehind group +re.compile(r'x|(?^(?:|x)))') + +#Misparsed on LGTM +re.compile(r"\[(?P[^[]*)\]\((?P[^)]*)") + +re.compile("", re.M) # ODASA-8056 + +# FP reported in https://github.com/github/codeql/issues/3712 +# This does not define a regex (but could be used by other code to do so) +escaped = re.escape("https://www.humblebundle.com/home/library") From e73cb0638ead618d6a0a70010c78d191d7246f13 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Mon, 10 May 2021 16:33:03 +0200 Subject: [PATCH 3/8] Python: Add tree view for ReDoS and AST viewer --- python/ql/src/semmle/python/RegexTreeView.qll | 309 ++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 python/ql/src/semmle/python/RegexTreeView.qll diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll new file mode 100644 index 000000000000..69f22fdeed30 --- /dev/null +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -0,0 +1,309 @@ +import python +private import semmle.python.RegexLiteral as L +private import semmle.python.RegexParserExtended as P + +/** Defenitions for compatibility with the JS ReDoS query */ +private newtype TRegExpParent = + TRegExpLiteral(L::RegexLiteral re) { exists(re.getRegex()) } or + TRegExp(P::Regex re) { + re.isRooted() and + not exists(P::OrRegex par | par.isRooted() and re.(P::OrRegex) = 
par.getLeft()) + } or + TClassChar(P::ClassChar ch) { ch.getClass().isRooted() and ch.isRooted() } or + TClassRange(P::ClassRange rn) { rn.isRooted() } + +class RegExpParent extends TRegExpParent { + RegExpTerm getChild(int i) { none() } + + RegExpTerm getAChild() { result = getChild(_) } + + RegExpParent getParent() { result.getAChild() = this } + + int getNumChild() { result = count(getAChild()) } + + string toString() { result = "" } + + predicate hasLocationInfo(string file, int startline, int startcol, int endline, int endcol) { + none() + } + + Location getLocation() { + result + .hasLocationInfo(this.getFile().getRelativePath(), this.getStartline(), this.getStartcol(), + this.getEndline(), this.getEndcol()) + } + + File getFile() { this.hasLocationInfo(result.getRelativePath(), _, _, _, _) } + + int getStartline() { this.hasLocationInfo(_, result, _, _, _) } + + int getStartcol() { this.hasLocationInfo(_, _, result, _, _) } + + int getEndline() { this.hasLocationInfo(_, _, _, result, _) } + + int getEndcol() { this.hasLocationInfo(_, _, _, _, result) } + + string getRawValue() { result = this.toString() } +} + +class RegExpLiteral extends RegExpParent, TRegExpLiteral { + L::RegexLiteral re; + + RegExpLiteral() { this = TRegExpLiteral(re) } + + override RegExpTerm getChild(int i) { result = TRegExp(re.getRegex()) and i = 0 } + + predicate isDotAll() { none() } + + override string toString() { result = re.toString() } + + override predicate hasLocationInfo( + string file, int startline, int startcol, int endline, int endcol + ) { + re.getLocation().hasLocationInfo(file, startline, startcol, endline, endcol) + } +} + +class RegExpTerm extends RegExpParent { + P::Node node; + + RegExpTerm() { + this = TRegExp(node) or + this = TClassChar(node) or + this = TClassRange(node) + } + + predicate isUsedAsRegExp() { any() } + + predicate isRootTerm() { node.isRoot() } + + override string toString() { result = node.toString() } + + RegExpLiteral getLiteral() { result = getRootTerm().getParent() } + + /** + * Gets the outermost term of this regular expression. 
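+   * This is the term whose parent is the enclosing `RegExpLiteral`.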
+ */ + RegExpTerm getRootTerm() { + isRootTerm() and + result = this + or + result = getParent().(RegExpTerm).getRootTerm() + } + + override predicate hasLocationInfo( + string file, int startline, int startcol, int endline, int endcol + ) { + node.hasLocationInfo(file, startline, startcol, endline, endcol) + } +} + +private class NormalRegExpTerm extends RegExpTerm, TRegExp { + override P::Regex node; + + NormalRegExpTerm() { this = TRegExp(node) } +} + +class RegExpAlt extends NormalRegExpTerm { + override P::OrRegex node; + + override RegExpTerm getChild(int i) { + result = TRegExp(orRevChild(node, orNumChild(node) - i - 1)) + } +} + +private P::Regex orRevChild(P::Regex re, int i) { + i = 0 and + not re instanceof P::OrRegex and + result = re + or + i = 0 and + result = re.(P::OrRegex).getRight() + or + i > 0 and + result = orRevChild(re.(P::OrRegex).getLeft(), i - 1) +} + +private int orNumChild(P::OrRegex re) { result = strictcount(orRevChild(re, _)) } + +class RegExpQuantifier extends NormalRegExpTerm { + override P::SuffixRegex node; + + override RegExpTerm getChild(int i) { i = 0 and result = TRegExp(node.getBody()) } +} + +class RegExpLookbehind extends NormalRegExpTerm { + RegExpLookbehind() { + node instanceof P::NegativeLookbehindRegex or node instanceof P::PositiveLookbehindRegex + } +} + +class RegExpStar extends RegExpQuantifier { + override P::StarRegex node; +} + +class RegExpPlus extends RegExpQuantifier { + override P::PlusRegex node; +} + +class RegExpRange extends RegExpQuantifier { + override P::RepeatRegex node; + + int getLowerBound() { result = node.getLowerBound() } + + int getUpperBound() { result = node.getUpperBound() } +} + +class RegExpOpt extends RegExpQuantifier { + override P::OptionalRegex node; +} + +class RegExpConstant extends RegExpTerm { + RegExpConstant() { + this = TRegExp(node.(P::ChRegex)) + or + this = TClassChar(node) + } + + predicate isCharacter() { any() } + + string getValue() { + result = node.(P::ChRegex).getChar() + or + result = node.(P::ClassChar).getChar() + } +} + +class RegExpDot extends NormalRegExpTerm { + override P::DotRegex node; +} + +class RegExpDollar extends NormalRegExpTerm { + override P::DollarRegex node; +} + +class RegExpCaret extends NormalRegExpTerm { + override P::CaretRegex node; +} + +// predicate findIt() +class RegExpCharacterClass extends NormalRegExpTerm { + override P::ClassRegex node; + + override RegExpTerm getChild(int i) { + result = classPart(classChildHelper0(node.getLeftNode().getRightNode(), i)) + } + + predicate isInverted() { node.isInverted() } + + predicate isUniversalClass() { + // [^] + isInverted() and not exists(getAChild()) + or + // [\w\W] and similar + not isInverted() and + exists(string cce1, string cce2 | + cce1 = getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } +} + +private RegExpTerm classPart(P::Node node) { + result = TClassChar(node) or + result = TClassRange(node) or + result = TRegExp(node.(P::EscapeClassRegex)) +} + +private P::Node classChildHelper0(P::Node node, int i) { + node.hasId("classstart") and i = 0 and result = node + or + node.getId() = "classstartclassinner1" and + ( + i = 0 and result = node.getLeftNode() + or + i > 0 and result = classChildHelper1(node.getRightNode(), i - 1) + ) +} + +private P::Node classChildHelper1(P::Node node, int i) { + node.hasId("classinner2") and result = classChildHelper2(node, i) + or + node.getId() 
= "classinner2-" and + exists(P::Node left, int num | + left = node.getLeftNode() and + num = classInner2NumChild(left) and + ( + i = num and + result = node.getRightNode() + or + i < num and + i >= 0 and + result = classChildHelper2(left, i) + ) + ) +} + +private int classInner2NumChild(P::Node node) { result = strictcount(classChildHelper2(node, _)) } + +private P::Node classChildHelper2(P::Node node, int i) { + node.hasId("classpart") and i = 0 and result = node + or + node.getId() = "classpartclassinner2" and + ( + i = 0 and result = node.getLeftNode() + or + i > 0 and result = classChildHelper2(node.getRightNode(), i - 1) + ) +} + +class RegExpCharacterClassEscape extends NormalRegExpTerm { + override P::EscapeClassRegex node; + + string getValue() { result = node.getClass() } +} + +class RegExpCharacterRange extends RegExpTerm { + override P::ClassRange node; + + RegExpCharacterRange() { this = TClassRange(node) } + + override RegExpTerm getChild(int i) { + i = 0 and + result = TClassChar(node.getLowerBound()) + or + i = 1 and + result = TClassChar(node.getUpperBound()) + } + + /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */ + predicate isRange(string lo, string hi) { + lo = getChild(0).(RegExpConstant).getValue() and + hi = getChild(1).(RegExpConstant).getValue() + } +} + +class RegExpSequence extends NormalRegExpTerm { + override P::SequenceRegex node; + + override RegExpTerm getChild(int i) { + i = 0 and + result = TRegExp(node.getLeft()) + or + i = 1 and + result = TRegExp(node.getRight()) + } +} + +class RegExpGroup extends NormalRegExpTerm { + override P::CaptureRegex node; + + override RegExpTerm getChild(int i) { + i = 0 and + result = TRegExp(node.getBody()) + } +} + +RegExpTerm getParsedRegExp(StrConst re) { result = TRegExpLiteral(re).(RegExpLiteral).getChild(0) } From 8d63d340694f7cfb3ec6025266d901b8ef6a88d4 Mon Sep 17 00:00:00 2001 From: Erik Krogh Kristensen Date: Tue, 11 May 2021 00:07:09 +0200 Subject: [PATCH 4/8] Python: add printAst support for regular expressions --- python/ql/src/semmle/python/PrintAst.qll | 39 +++++++++++++++++++ python/ql/src/semmle/python/RegexTreeView.qll | 34 ++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/python/ql/src/semmle/python/PrintAst.qll b/python/ql/src/semmle/python/PrintAst.qll index 63ec5b53d0a2..59e2458af419 100644 --- a/python/ql/src/semmle/python/PrintAst.qll +++ b/python/ql/src/semmle/python/PrintAst.qll @@ -7,6 +7,7 @@ */ import python +import semmle.python.RegexTreeView private newtype TPrintAstConfiguration = MkPrintAstConfiguration() @@ -53,6 +54,9 @@ private newtype TPrintAstNode = not list = any(Module mod).getBody() and not forall(AstNode child | child = list.getAnItem() | isNotNeeded(child)) and exists(list.getAnItem()) + } or + TRegExpTermNode(RegExpTerm term) { + exists(StrConst str | term.getRootTerm() = getParsedRegExp(str) and shouldPrint(str, _)) } /** @@ -419,6 +423,41 @@ class ParameterNode extends AstElementNode { } } +/** + * A print node for a `StrConst`. + * + * The string has a child, if the child is used as a regular expression, + * which is the root of the regular expression. + */ +class StrConstNode extends AstElementNode { + override StrConst element; + + override PrintAstNode getChild(int childIndex) { + childIndex = 0 and result.(RegExpTermNode).getTerm() = getParsedRegExp(element) + } +} + +/** + * A print node for a regular expression term. 
+ */ +class RegExpTermNode extends TRegExpTermNode, PrintAstNode { + RegExpTerm term; + + RegExpTermNode() { this = TRegExpTermNode(term) } + + RegExpTerm getTerm() { result = term } + + override PrintAstNode getChild(int childIndex) { + result.(RegExpTermNode).getTerm() = term.getChild(childIndex) + } + + override string toString() { + result = "[" + strictconcat(term.getPrimaryQLClass(), " | ") + "] " + term.toString() + } + + override Location getLocation() { result = term.getLocation() } +} + /** * Gets the `i`th child from `node` ordered by location. */ diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index 69f22fdeed30..9014e06707d3 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -96,6 +96,8 @@ class RegExpTerm extends RegExpParent { ) { node.hasLocationInfo(file, startline, startcol, endline, endcol) } + + string getPrimaryQLClass() { result = "???" } } private class NormalRegExpTerm extends RegExpTerm, TRegExp { @@ -110,6 +112,8 @@ class RegExpAlt extends NormalRegExpTerm { override RegExpTerm getChild(int i) { result = TRegExp(orRevChild(node, orNumChild(node) - i - 1)) } + + override string getPrimaryQLClass() { result = "RegExpAlt" } } private P::Regex orRevChild(P::Regex re, int i) { @@ -130,20 +134,28 @@ class RegExpQuantifier extends NormalRegExpTerm { override P::SuffixRegex node; override RegExpTerm getChild(int i) { i = 0 and result = TRegExp(node.getBody()) } + + override string getPrimaryQLClass() { result = "RegExpQuantifier" } } class RegExpLookbehind extends NormalRegExpTerm { RegExpLookbehind() { node instanceof P::NegativeLookbehindRegex or node instanceof P::PositiveLookbehindRegex } + + override string getPrimaryQLClass() { result = "RegExpLookbehind" } } class RegExpStar extends RegExpQuantifier { override P::StarRegex node; + + override string getPrimaryQLClass() { result = "RegExpStar" } } class RegExpPlus extends RegExpQuantifier { override P::PlusRegex node; + + override string getPrimaryQLClass() { result = "RegExpPlus" } } class RegExpRange extends RegExpQuantifier { @@ -152,10 +164,14 @@ class RegExpRange extends RegExpQuantifier { int getLowerBound() { result = node.getLowerBound() } int getUpperBound() { result = node.getUpperBound() } + + override string getPrimaryQLClass() { result = "RegExpRange" } } class RegExpOpt extends RegExpQuantifier { override P::OptionalRegex node; + + override string getPrimaryQLClass() { result = "RegExpOpt" } } class RegExpConstant extends RegExpTerm { @@ -172,18 +188,26 @@ class RegExpConstant extends RegExpTerm { or result = node.(P::ClassChar).getChar() } + + override string getPrimaryQLClass() { result = "RegExpConstant" } } class RegExpDot extends NormalRegExpTerm { override P::DotRegex node; + + override string getPrimaryQLClass() { result = "RegExpDot" } } class RegExpDollar extends NormalRegExpTerm { override P::DollarRegex node; + + override string getPrimaryQLClass() { result = "RegExpDollar" } } class RegExpCaret extends NormalRegExpTerm { override P::CaretRegex node; + + override string getPrimaryQLClass() { result = "RegExpCaret" } } // predicate findIt() @@ -209,6 +233,8 @@ class RegExpCharacterClass extends NormalRegExpTerm { cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() ) } + + override string getPrimaryQLClass() { result = "RegExpCharacterClass" } } private RegExpTerm classPart(P::Node node) { @@ -263,6 +289,8 @@ class RegExpCharacterClassEscape extends NormalRegExpTerm { 
override P::EscapeClassRegex node; string getValue() { result = node.getClass() } + + override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } } class RegExpCharacterRange extends RegExpTerm { @@ -283,6 +311,8 @@ class RegExpCharacterRange extends RegExpTerm { lo = getChild(0).(RegExpConstant).getValue() and hi = getChild(1).(RegExpConstant).getValue() } + + override string getPrimaryQLClass() { result = "RegExpCharacterRange" } } class RegExpSequence extends NormalRegExpTerm { @@ -295,6 +325,8 @@ class RegExpSequence extends NormalRegExpTerm { i = 1 and result = TRegExp(node.getRight()) } + + override string getPrimaryQLClass() { result = "RegExpSequence" } } class RegExpGroup extends NormalRegExpTerm { @@ -304,6 +336,8 @@ class RegExpGroup extends NormalRegExpTerm { i = 0 and result = TRegExp(node.getBody()) } + + override string getPrimaryQLClass() { result = "RegExpGroup" } } RegExpTerm getParsedRegExp(StrConst re) { result = TRegExpLiteral(re).(RegExpLiteral).getChild(0) } From 690b0552eca974f891f136bf92bf42a4354f64f7 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Tue, 11 May 2021 13:39:28 +0200 Subject: [PATCH 5/8] Python: Limit strings to parse --- python/ql/src/semmle/python/RegexLiteral.qll | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ql/src/semmle/python/RegexLiteral.qll b/python/ql/src/semmle/python/RegexLiteral.qll index e6c324098321..02b4df4cac5e 100644 --- a/python/ql/src/semmle/python/RegexLiteral.qll +++ b/python/ql/src/semmle/python/RegexLiteral.qll @@ -5,7 +5,10 @@ private import RegexParserExtended class RegexLiteralValue extends ParsedString { R::Regex lit; - RegexLiteralValue() { this = lit.getText() } + RegexLiteralValue() { + this = lit.getText() and + exists(lit.getLocation().getFile().getRelativePath()) + } override ParserConfiguration getConfiguration() { result instanceof RegexParserConfiguration } From 99cbb11c97e92ad1dc955c73866e38fa0925264e Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Thu, 20 May 2021 15:39:50 +0200 Subject: [PATCH 6/8] Python: A number of parser tweaks --- .../src/semmle/python/RegexParserExtended.qll | 185 ++++++++++++++---- python/ql/src/semmle/python/RegexTreeView.qll | 3 +- .../regexparser/Alternation.expected | 2 +- .../library-tests/regexparser/Alternation.ql | 16 +- .../regexparser/GroupContents.ql | 16 +- .../library-tests/regexparser/Qualified.ql | 15 +- .../test/library-tests/regexparser/Regex.ql | 9 - 7 files changed, 179 insertions(+), 67 deletions(-) diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll index d6a6a009afed..3f538b054538 100644 --- a/python/ql/src/semmle/python/RegexParserExtended.qll +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -60,22 +60,20 @@ class RegexParserConfiguration extends ParserConfiguration { regex = "\\(\\?P<\\w+>" and id = "(named" } - predicate testRegex() { - // "(?P".regexpMatch("\\(\\?P<[:alnum:]+>") - "n1".regexpMatch("\\w+") - } - /* * Use a proper unambiguous grammar for regexes: * * regex -> orregex * orregex -> seqregex * | orregex '|' seqregex + * | '|' seqregex + * | orregex '|' * seqregex -> primary * | primary seqregex * primary -> group - * | primary * - * | primary + + * | primary '*' + * | primary '+' + * | primary '?' 
* | char * | class * | escclass @@ -93,27 +91,32 @@ class RegexParserConfiguration extends ParserConfiguration { * | '[^]' if allowed empty classes * classinner -> classstart classinner1 * | classstart - * classinner1 -> classinner2 '-' - * | classinner2 - * classinner2 -> classpart - * | classpart classinner2 - * classstart -> '-' - * | ']' if not allowed empty classes - * | classpart - * classpart -> normalchar - * | classrange + * | classinner1 + * classstart -> ']' + * classinner1 -> classpart + * | classpart classinner1 + * | classpart_c + * | classpart_c- + * | '-' + * classpart_c -> clschar + * classpart_c clschar + * classpart clschar + * classpart_c- -> classpart_c '-' + * classpart -> // does not end in a clschar * | escclass - * classrange -> normalchar '-' normalchar - * + * | classpart_c escclass + * | classpart_c- escclass + * | classpart_c- '-' + * | classrange + * classrange -> clschar '-' clschar + * clschar -> normalchar + * | anychar + * | '(', ')', '|', '+', '*', '?' * * Things that currently don't parse: - * - Empty regexes (as standalone empty strings, or part of a disjunction or group, e.g. `(a|)` or `()`) - * - Inline options, i.e. `(?s)` - * - Lookaheads/lookbehinds - * - Java specific: Nested character classes, intersecting character classes + * - Empty regexes (as standalone empty strings, or part of a group, e.g. `()`) * * Things that parse but with the wrong semantics: - * - Possesive and reluctant quantifiers (`a*?` is treated as an optional regex with body `a*`) * - Most escape sequences with special meanings (i.e. besides "quote the next character" or predefined character classes) */ @@ -123,6 +126,8 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "primary" and result = "seqregex" or + // a = "orregex|" and result = "orregex" + // or a = "seqregex" and result = "orregex" or a = "orregex" and result = "regex" @@ -132,22 +137,24 @@ class RegexParserConfiguration extends ParserConfiguration { a in ["normalchar", "-", "]"] and result = "char" or - a in ["normalchar", "anychar", "()|+*?".charAt(_)] and result = "clschar" - or - a = "classstart" and result = "classinner" - or - a = "classinner2" and result = "classinner1" + a in ["normalchar", "anychar", "()|+*?[".charAt(_)] and result = "clschar" or - a in ["classpart", "-"] and result = "classstart" + a in ["classstart", "classinner1"] and result = "classinner" or - a = "classpart" and result = "classinner2" + a in ["classpart", "classpart_c", "classpart_c-", "-"] and result = "classinner1" or a = "]" and not Conf::allowedEmptyClasses() and result = "classstart" or - a in ["clschar", "classrange", "escclass"] and result = "classpart" + a in ["classrange", "escclass"] and result = "classpart" + or + a = "clschar" and result = "classpart_c" } override string rule(string a, string b) { + a = "|" and b = "seqregex" and result = "orregex" + or + a = "orregex" and b = "|" and result = "orregex" + or a = "primary" and b = "seqregex" and result = "seqregex" or a = "primary" and b = "*" and result = "primary" @@ -168,9 +175,17 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "classstart" and b = "classinner1" and result = "classinner" or - a = "classpart" and b = "classinner2" and result = "classinner2" + a = "classpart" and b = "classinner1" and result = "classinner1" + or + a in ["classpart", "classpart_c"] and b = "clschar" and result = "classpart_c" + or + a = "classpart_c" and b = "-" and result = "classpart_c-" + or + a = "classpart_c" and b = "escclass" and result = 
"classpart" or - a = "classinner2" and b = "-" and result = "classinner1" + a = "classpart_c-" and b = "escclass" and result = "classpart" + or + a = "classpart_c-" and b = "-" and result = "classpart" } override string rule(string a, string b, string c) { @@ -276,7 +291,9 @@ class ClassRange extends Node { } class SequenceRegex extends Regex { - SequenceRegex() { id = "primaryseqregex" } + SequenceRegex() { + this.hasId("primaryseqregex") and not this.getParent().getId() = "primaryseqregex" + } Regex getLeft() { result = this.getLeftNode() } @@ -284,7 +301,15 @@ class SequenceRegex extends Regex { } abstract class SuffixRegex extends Regex { - Regex getBody() { result = this.getLeftNode() } + Regex getBody() { + if this.isNonGreedy() + then result = this.getLeftNode().getLeftNode() + else result = this.getLeftNode() + } + + abstract predicate isMaybeEmpty(); + + abstract predicate isNonGreedy(); } abstract class UnboundedRegex extends SuffixRegex { } @@ -293,14 +318,38 @@ abstract class RepeatRegex extends SuffixRegex { abstract int getLowerBound(); abstract int getUpperBound(); + + override predicate isMaybeEmpty() { getLowerBound() = 0 } + + override predicate isNonGreedy() { none() } } class StarRegex extends UnboundedRegex { - StarRegex() { id = "primary*" } + boolean nonGreedy; + + StarRegex() { + id = "primary*" and not this.getParent().getId() = "primary?" and nonGreedy = false + or + id = "primary?" and this.getLeftNode().getId() = "primary*" and nonGreedy = true + } + + override predicate isMaybeEmpty() { any() } + + override predicate isNonGreedy() { nonGreedy = true } } class PlusRegex extends UnboundedRegex { - PlusRegex() { id = "primary+" } + boolean nonGreedy; + + PlusRegex() { + id = "primary+" and not this.getParent().getId() = "primary?" and nonGreedy = false + or + id = "primary?" and this.getLeftNode().getId() = "primary+" and nonGreedy = true + } + + override predicate isMaybeEmpty() { none() } + + override predicate isNonGreedy() { nonGreedy = true } } class FixedRepeatRegex extends SuffixRegex, RepeatRegex { @@ -366,15 +415,48 @@ class OpenRepeatRegex extends UnboundedRegex, RepeatRegex { } class OptionalRegex extends SuffixRegex { - OptionalRegex() { id = "primary?" } + boolean nonGreedy; + + OptionalRegex() { + id = "primary?" and + not this.getLeftNode().getId() = "primary*" and + not this.getLeftNode().getId() = "primary+" and + if this.getLeftNode().getId() = "primary?" 
then nonGreedy = true else nonGreedy = false + } + + override predicate isMaybeEmpty() { any() } + + override predicate isNonGreedy() { nonGreedy = true } } -class OrRegex extends Regex { - OrRegex() { id = "orregex|seqregex" } +abstract class OrRegex extends Regex { + abstract Regex getLeft(); - Regex getLeft() { result = this.getLeftNode().getLeftNode() } + abstract Regex getRight(); +} - Regex getRight() { result = this.getRightNode() } +class FullOrRegex extends OrRegex { + FullOrRegex() { id = "orregex|seqregex" } + + override Regex getLeft() { result = this.getLeftNode().getLeftNode() } + + override Regex getRight() { result = this.getRightNode() } +} + +class LeftOrRegex extends OrRegex { + LeftOrRegex() { id = "orregex|" and not this.getParent() instanceof FullOrRegex } + + override Regex getLeft() { result = this.getLeftNode() } + + override Regex getRight() { none() } +} + +class RightOrRegex extends OrRegex { + RightOrRegex() { id = "|seqregex" } + + override Regex getLeft() { none() } + + override Regex getRight() { result = this.getRightNode() } } class CaptureRegex extends Regex { @@ -389,9 +471,11 @@ class BackrefRegex extends Regex { class GroupRegex extends Regex { GroupRegex() { this.hasId("group") } + + Regex getContents() { result = this.getLeftNode().getRightNode() } } -class ConfGroupRegex extends Regex { +class ConfGroupRegex extends GroupRegex { ConfGroupRegex() { this.hasId("confgroup") } } @@ -418,7 +502,22 @@ class ParsedRegex extends Regex { } string testTokenize(ParsedString text, string id, int pos, int seq) { + // text.toString() = "\\|\\[\\][123]|\\{\\}" and // text.toString() = "\\A[+-]?\\d+" and text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and + // text.toString() = "(?m)^(?!$)" and result = tokenize(text, id, pos, seq) } + +predicate testRegex() { + "(?P".regexpMatch("\\(\\?P<\\w+>") + // "n1".regexpMatch("\\w+") +} + +predicate testParse(ParsedString s, int start, int next, string id) { + // s = "\\A[+-]?\\d+" and + // s = "\\|\\[\\][123]|\\{\\}" and + // s = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and + s = "012345678" and + s.nodes(start, next, id) +} diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index 9014e06707d3..f41a2c7e7a17 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -2,7 +2,7 @@ import python private import semmle.python.RegexLiteral as L private import semmle.python.RegexParserExtended as P -/** Defenitions for compatibility with the JS ReDoS query */ +/** Definitions for compatibility with the JS ReDoS query */ private newtype TRegExpParent = TRegExpLiteral(L::RegexLiteral re) { exists(re.getRegex()) } or TRegExp(P::Regex re) { @@ -174,6 +174,7 @@ class RegExpOpt extends RegExpQuantifier { override string getPrimaryQLClass() { result = "RegExpOpt" } } +// TODO: This is supposed to be a constant sequence. class RegExpConstant extends RegExpTerm { RegExpConstant() { this = TRegExp(node.(P::ChRegex)) diff --git a/python/ql/test/library-tests/regexparser/Alternation.expected b/python/ql/test/library-tests/regexparser/Alternation.expected index 2fe6572074e6..e50655fdc24b 100644 --- a/python/ql/test/library-tests/regexparser/Alternation.expected +++ b/python/ql/test/library-tests/regexparser/Alternation.expected @@ -19,4 +19,4 @@ | x\| | 0 | 2 | x\| | 0 | 1 | x | | x\| | 0 | 2 | x\| | 2 | 2 | | | x\|(? 
r.getEndOffset() - ) - ) and kind = "sequence" or r instanceof ClassRegex and kind = "char-set" From fc5f2e6138d88edacc805731fc4e377cc285ab79 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Thu, 20 May 2021 18:01:42 +0200 Subject: [PATCH 7/8] Python: use constants --- .../src/semmle/python/RegexParserExtended.qll | 49 ++++++++++++++++--- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll index 3f538b054538..62ade6893b1b 100644 --- a/python/ql/src/semmle/python/RegexParserExtended.qll +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -7,17 +7,23 @@ module RegexSpecific { import RegexSpecific as Conf +private string escapableChars() { result = "AbBdDsSwWZafnNrtuUvx\\\\" } + +private string keywordChars() { result = "()|*+?\\-\\[\\]" } + class RegexParserConfiguration extends ParserConfiguration { RegexParserConfiguration() { this = "Extended regex parser configuration" } override predicate hasTokenRegex(string regex) { - regex = "[()|*+?\\-\\[\\]]" + regex = "[" + keywordChars() + "]" or regex = "\\[\\^" } override predicate hasTokenRegex(string regex, string id) { - regex = "[^()|.$\\^\\[\\]\\\\]" and id = "normalchar" + regex = "[^" + keywordChars() + ".$\\^\\\\]" and id = "normalchar" + or + regex = "\\\\[^" + escapableChars() + "0-9]" and id = "normalchar" or regex = "\\\\[0-9]+" and id = "backref" or @@ -37,9 +43,7 @@ class RegexParserConfiguration extends ParserConfiguration { or regex = "\\{[0-9]+,\\}" and id = "openrepeat" or - regex = "\\\\[^AbBdDsSwWZafnNrtuUvx\\\\0-9]" and id = "normalchar" - or - regex = "\\\\[AbBdDsSwWZafnNrtuUvx\\\\]" and id = "escclass" + regex = "\\\\[" + escapableChars() + "]" and id = "escclass" or regex = "\\(\\?[aiLmsux]+\\)" and id = "confgroup" or @@ -504,14 +508,38 @@ class ParsedRegex extends Regex { string testTokenize(ParsedString text, string id, int pos, int seq) { // text.toString() = "\\|\\[\\][123]|\\{\\}" and // text.toString() = "\\A[+-]?\\d+" and - text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and + // text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and // text.toString() = "(?m)^(?!$)" and + text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and result = tokenize(text, id, pos, seq) } -predicate testRegex() { - "(?P".regexpMatch("\\(\\?P<\\w+>") +string canParse(ParsedString text) { result = text.toString() } + +predicate testTokenRegex(string text, string kind) { + // "(?P".regexpMatch("\\(\\?P<\\w+>") + // "n1".regexpMatch("\\w+") + exists(string regex | + any(RegexParserConfiguration c).hasTokenRegex(regex, kind) and + text.regexpMatch(regex) + ) and + text = "_" and + kind = "normalchar" +} + +predicate testT(ParsedString text, int length, string failedAt) { + unsuccessfullyTokenized(text, length, failedAt) //and + // text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" +} + +predicate testKeywordRegex(string text) { + // "(?P".regexpMatch("\\(\\?P<\\w+>") // "n1".regexpMatch("\\w+") + exists(string regex | + any(RegexParserConfiguration c).hasTokenRegex(regex) and + text.regexpMatch(regex) + ) and + text = "(" } predicate testParse(ParsedString s, int start, int next, string id) { @@ -521,3 +549,8 @@ predicate testParse(ParsedString s, int start, int next, string id) { s = "012345678" and s.nodes(start, next, id) } + +string testRawTokens(ParsedString s, int pos, string id) { + s.toString() = 
"^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and + result = s.tokens(pos, id) +} From d0f2857f881ecb04912de9247e245a5d3b41a348 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 26 May 2021 09:53:34 +0200 Subject: [PATCH 8/8] Python: collecting constants part I not having single char constants yet all redos results disappeared --- .../src/semmle/python/RegexParserExtended.qll | 101 ++++++++++++++---- python/ql/src/semmle/python/RegexTreeView.qll | 28 +++-- 2 files changed, 95 insertions(+), 34 deletions(-) diff --git a/python/ql/src/semmle/python/RegexParserExtended.qll b/python/ql/src/semmle/python/RegexParserExtended.qll index 62ade6893b1b..aba7502c2599 100644 --- a/python/ql/src/semmle/python/RegexParserExtended.qll +++ b/python/ql/src/semmle/python/RegexParserExtended.qll @@ -72,13 +72,22 @@ class RegexParserConfiguration extends ParserConfiguration { * | orregex '|' seqregex * | '|' seqregex * | orregex '|' - * seqregex -> primary + * seqregex -> constant + * | primaryseqregex + * | constant primaryseqregex + * primaryseqregex -> // seqregex starting with a primary + * | primary * | primary seqregex - * primary -> group + * constant -> char + * | char constant + * char -> normalchar + * | '-' + * | ']' + * primary -> // not a constant + * | group * | primary '*' * | primary '+' * | primary '?' - * | char * | class * | escclass * group -> '(' regex ')' @@ -102,6 +111,7 @@ class RegexParserConfiguration extends ParserConfiguration { * | classpart_c * | classpart_c- * | '-' + * | '-' classinner1 * classpart_c -> clschar * classpart_c clschar * classpart clschar @@ -115,7 +125,7 @@ class RegexParserConfiguration extends ParserConfiguration { * classrange -> clschar '-' clschar * clschar -> normalchar * | anychar - * | '(', ')', '|', '+', '*', '?' + * | '(', ')', '|', '+', '*', '?', '$' * * Things that currently don't parse: * - Empty regexes (as standalone empty strings, or part of a group, e.g. 
`()`) @@ -125,13 +135,15 @@ class RegexParserConfiguration extends ParserConfiguration { */ override string rule(string a) { - a in ["char", "anychar", "dollar", "caret", "backref", "class", "escclass", "group"] and + a in ["anychar", "dollar", "caret", "backref", "class", "escclass", "group"] and result = "primary" or - a = "primary" and result = "seqregex" + a in ["constant", "primaryseqregex"] and result = "seqregex" + or + a = "char" and result = "constant" + or + a = "primary" and result = "primaryseqregex" or - // a = "orregex|" and result = "orregex" - // or a = "seqregex" and result = "orregex" or a = "orregex" and result = "regex" @@ -141,7 +153,7 @@ class RegexParserConfiguration extends ParserConfiguration { a in ["normalchar", "-", "]"] and result = "char" or - a in ["normalchar", "anychar", "()|+*?[".charAt(_)] and result = "clschar" + a in ["normalchar", "anychar", "()|+*?[$".charAt(_)] and result = "clschar" or a in ["classstart", "classinner1"] and result = "classinner" or @@ -159,7 +171,11 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "orregex" and b = "|" and result = "orregex" or - a = "primary" and b = "seqregex" and result = "seqregex" + a = "constant" and b = "primaryseqregex" and result = "seqregex" + or + a = "primary" and b = "seqregex" and result = "primaryseqregex" + or + a = "char" and b = "constant" and result = "constant" or a = "primary" and b = "*" and result = "primary" or @@ -175,12 +191,28 @@ class RegexParserConfiguration extends ParserConfiguration { or a = "primary" and b = "openrepeat" and result = "primary" or + a = "constant" and b = "*" and result = "primary" + or + a = "constant" and b = "+" and result = "primary" + or + a = "constant" and b = "?" and result = "primary" + or + a = "constant" and b = "fixedrepeat" and result = "primary" + or + a = "constant" and b = "rangerepeat" and result = "primary" + or + a = "constant" and b = "uptorepeat" and result = "primary" + or + a = "constant" and b = "openrepeat" and result = "primary" + or a in ["[", "[^"] and b = "]" and Conf::allowedEmptyClasses() and result = "class" or a = "classstart" and b = "classinner1" and result = "classinner" or a = "classpart" and b = "classinner1" and result = "classinner1" or + a = "-" and b = "classinner1" and result = "classinner1" + or a in ["classpart", "classpart_c"] and b = "clschar" and result = "classpart_c" or a = "classpart_c" and b = "-" and result = "classpart_c-" @@ -304,6 +336,18 @@ class SequenceRegex extends Regex { Regex getRight() { result = this.getRightNode() } } +class ConstantRegex extends Regex { + ConstantRegex() { + this.getId() = "charconstant" and not this.getParent().getId() = "charconstant" + } +} + +predicate isConst(Regex node, Node parent, string parentId) { + node.getId() = "charconstant" and + parent = node.getParent() and + parent.getId() = parentId +} + abstract class SuffixRegex extends Regex { Regex getBody() { if this.isNonGreedy() @@ -332,9 +376,13 @@ class StarRegex extends UnboundedRegex { boolean nonGreedy; StarRegex() { - id = "primary*" and not this.getParent().getId() = "primary?" and nonGreedy = false + id = ["primary", "constant"] + "*" and + not this.getParent().getId() = "primary?" and + nonGreedy = false or - id = "primary?" and this.getLeftNode().getId() = "primary*" and nonGreedy = true + id = ["primary", "constant"] + "?" 
and + this.getLeftNode().getId() = "primary*" and + nonGreedy = true } override predicate isMaybeEmpty() { any() } @@ -346,9 +394,13 @@ class PlusRegex extends UnboundedRegex { boolean nonGreedy; PlusRegex() { - id = "primary+" and not this.getParent().getId() = "primary?" and nonGreedy = false + id = ["primary", "constant"] + "+" and + not this.getParent().getId() = "primary?" and + nonGreedy = false or - id = "primary?" and this.getLeftNode().getId() = "primary+" and nonGreedy = true + id = ["primary", "constant"] + "?" and + this.getLeftNode().getId() = "primary+" and + nonGreedy = true } override predicate isMaybeEmpty() { none() } @@ -357,7 +409,7 @@ class PlusRegex extends UnboundedRegex { } class FixedRepeatRegex extends SuffixRegex, RepeatRegex { - FixedRepeatRegex() { id = "primaryfixedrepeat" } + FixedRepeatRegex() { id = ["primary", "constant"] + "fixedrepeat" } override int getLowerBound() { exists(string suff, string num | @@ -371,7 +423,7 @@ class FixedRepeatRegex extends SuffixRegex, RepeatRegex { } class UptoRepeatRegex extends SuffixRegex, RepeatRegex { - UptoRepeatRegex() { id = "primaryuptorepeat" } + UptoRepeatRegex() { id = ["primary", "constant"] + "uptorepeat" } override int getLowerBound() { result = 0 } @@ -385,7 +437,7 @@ class UptoRepeatRegex extends SuffixRegex, RepeatRegex { } class RangeRegex extends SuffixRegex, RepeatRegex { - RangeRegex() { id = "primaryrangerepeat" } + RangeRegex() { id = ["primary", "constant"] + "rangerepeat" } override int getLowerBound() { exists(string suff, string numl | @@ -405,7 +457,7 @@ class RangeRegex extends SuffixRegex, RepeatRegex { } class OpenRepeatRegex extends UnboundedRegex, RepeatRegex { - OpenRepeatRegex() { id = "primaryopenrepeat" } + OpenRepeatRegex() { id = ["primary", "constant"] + "openrepeat" } override int getLowerBound() { exists(string suff, string num | @@ -422,10 +474,12 @@ class OptionalRegex extends SuffixRegex { boolean nonGreedy; OptionalRegex() { - id = "primary?" and - not this.getLeftNode().getId() = "primary*" and - not this.getLeftNode().getId() = "primary+" and - if this.getLeftNode().getId() = "primary?" then nonGreedy = true else nonGreedy = false + id = ["primary", "constant"] + "?" and + not this.getLeftNode().getId() = ["primary", "constant"] + "*" and + not this.getLeftNode().getId() = ["primary", "constant"] + "+" and + if this.getLeftNode().getId() = ["primary", "constant"] + "?" + then nonGreedy = true + else nonGreedy = false } override predicate isMaybeEmpty() { any() } @@ -510,7 +564,8 @@ string testTokenize(ParsedString text, string id, int pos, int seq) { // text.toString() = "\\A[+-]?\\d+" and // text.toString() = "\\[(?P[^[]*)\\]\\((?P[^)]*)" and // text.toString() = "(?m)^(?!$)" and - text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and + // text.toString() = "^\\b_((?:__|[^_])+?)_\\b|^\\*((?:\\*\\*|[^*])+?)\\*(?!\\*)" and + text.toString() = "^[\\_$a-z][\\_$a-z0-9]*(\\[.*?\\])*(\\.[\\_$a-z][\\_$a-z0-9]*(\\[.*?\\])*)*$" and result = tokenize(text, id, pos, seq) } diff --git a/python/ql/src/semmle/python/RegexTreeView.qll b/python/ql/src/semmle/python/RegexTreeView.qll index f41a2c7e7a17..8504866ead6d 100644 --- a/python/ql/src/semmle/python/RegexTreeView.qll +++ b/python/ql/src/semmle/python/RegexTreeView.qll @@ -175,20 +175,26 @@ class RegExpOpt extends RegExpQuantifier { } // TODO: This is supposed to be a constant sequence. 
-class RegExpConstant extends RegExpTerm { - RegExpConstant() { - this = TRegExp(node.(P::ChRegex)) - or - this = TClassChar(node) - } +// class RegExpConstant extends RegExpTerm { +// RegExpConstant() { +// this = TRegExp(node.(P::ChRegex)) +// or +// this = TClassChar(node) +// } +// predicate isCharacter() { any() } +// string getValue() { +// result = node.(P::ChRegex).getChar() +// or +// result = node.(P::ClassChar).getChar() +// } +// override string getPrimaryQLClass() { result = "RegExpConstant" } +// } +class RegExpConstant extends NormalRegExpTerm { + override P::ConstantRegex node; predicate isCharacter() { any() } - string getValue() { - result = node.(P::ChRegex).getChar() - or - result = node.(P::ClassChar).getChar() - } + string getValue() { result = node.getText() } override string getPrimaryQLClass() { result = "RegExpConstant" } }
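
A quick way to sanity-check the new tree view from a scratch query: a minimal sketch, not part of the patch series, assuming the modules stay at the `semmle.python.*` paths used above and that `getPrimaryQLClass()` is declared on the base `RegExpTerm` class (the overrides in RegexTreeView.qll suggest it is).

import python
import semmle.python.RegexTreeView

// Sketch only: pairs every regex literal that the extended parser handles
// with the primary QL class of its top-level term (e.g. RegExpSequence,
// RegExpOpt), using only getParsedRegExp and getPrimaryQLClass from above.
from StrConst re, RegExpTerm term
where term = getParsedRegExp(re)
select re, term.getPrimaryQLClass()

An empty result for a literal that should parse usually means the tokenizer rejected it; the testTokenize and testRawTokens predicates in RegexParserExtended.qll are the place to look in that case.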