github · semmle-qlci · Jun 3, 2019 · May 27, 2019 · May 31, 2019 · May 31, 2019
diff --git a/change-notes/1.21/analysis-javascript.md b/change-notes/1.21/analysis-javascript.md
@@ -27,6 +27,7 @@
 
 | **Query**                                     | **Tags**                                             | **Purpose**                                                                                                                                                                 |
 |-----------------------------------------------|------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Missing regular expression anchor (`js/regex/missing-regexp-anchor`) | correctness, security, external/cwe/cwe-20 | Highlights regular expression patterns that may be missing an anchor, indicating a possible violation of [CWE-20](https://cwe.mitre.org/data/definitions/20.html). Results are not shown on LGTM by default. |
 | Prototype pollution (`js/prototype-pollution`)    | security, external/cwe-250, external/cwe-400 | Highlights code that allows an attacker to modify a built-in prototype object through an unsanitized recursive merge function. The results are shown on LGTM by default. |
 
 ## Changes to existing queries

diff --git a/javascript/config/suites/javascript/security b/javascript/config/suites/javascript/security
@@ -3,6 +3,7 @@
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteHostnameRegExp.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlSubstringSanitization.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncorrectSuffixCheck.ql: /Security/CWE/CWE-020
++ semmlecode-javascript-queries/Security/CWE-020/MissingRegExpAnchor.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-022/TaintedPath.ql: /Security/CWE/CWE-022
 + semmlecode-javascript-queries/Security/CWE-022/ZipSlip.ql: /Security/CWE/CWE-022
 + semmlecode-javascript-queries/Security/CWE-078/CommandInjection.ql: /Security/CWE/CWE-078

diff --git a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -12,38 +12,6 @@
 
 import javascript
 
-/**
- * Gets a node whose value may flow (inter-procedurally) to a position where it is interpreted
- * as a regular expression.
- */
-DataFlow::Node regExpSource(DataFlow::Node re, DataFlow::TypeBackTracker t) {
-  t.start() and
-  re = result and
-  isInterpretedAsRegExp(result)
-  or
-  exists(DataFlow::TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
-    t2 = t.smallstep(result, succ)
-    or
-    any(TaintTracking::AdditionalTaintStep dts).step(result, succ) and
-    t = t2
-  )
-}
-
-DataFlow::Node regExpSource(DataFlow::Node re) {
-  result = regExpSource(re, DataFlow::TypeBackTracker::end())
-}
-
-/** Holds if `re` is a regular expression with value `pattern`. */
-predicate regexp(DataFlow::Node re, string pattern, string kind, DataFlow::Node aux) {
-  re.asExpr().(RegExpLiteral).getValue() = pattern and
-  kind = "regular expression" and
-  aux = re
-  or
-  re = regExpSource(aux) and
-  pattern = re.getStringValue() and
-  kind = "string, which is used as a regular expression $@,"
-}
-
 /**
  * Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
  * and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
@@ -58,11 +26,21 @@ predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
             "([():|?a-z0-9-]+(\\\\)?[.]" + RegExpPatterns::commonTLD() + ")" + ".*", 1)
 }
 
-from DataFlow::Node re, string pattern, string hostPart, string kind, DataFlow::Node aux
-where regexp(re, pattern, kind, aux) and
+from RegExpPatternSource re, string pattern, string hostPart, string kind, DataFlow::Node aux
+where
+  pattern = re.getPattern() and
   isIncompleteHostNameRegExpPattern(pattern, hostPart) and
+  (
+    if re.getAParse() != re
+    then (
+      kind = "string, which is used as a regular expression $@," and
+      aux = re.getAParse()
+    ) else (
+      kind = "regular expression" and aux = re
+    )
+  ) and
   // ignore patterns with capture groups after the TLD
   not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*")
 select re,
   "This " + kind + " has an unescaped '.' before '" + hostPart +
-    "', so it might match more hosts than expected.", aux, "here"
+    "', so it might match more hosts than expected.", aux, "here"
diff --git a/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.qhelp b/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.qhelp
@@ -0,0 +1,77 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted input with regular expressions is a
+			common technique.  However, it is error-prone to match untrusted input
+			against regular expressions without anchors such as <code>^</code> or
+			<code>$</code>.  Malicious input can bypass such security checks by
+			embedding one of the allowed patterns in an unexpected location.
+
+		</p>
+
+		<p>
+
+			Even if the matching is not done in a security-critical
+			context, it may still cause undesirable behavior when the regular
+			expression accidentally matches.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Use anchors to ensure that regular expressions match at
+			the expected locations.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains, and not some malicious site.
+
+		</p>
+
+		<sample src="examples/MissingRegExpAnchor_BAD.js"/>
+
+		<p>
+
+			The check with the regular expression match is, however, easy to bypass. For example
+			by embedding <code>example.com</code> in the path component:
+			<code>http://evil-example.net/example.com</code>, or in the query
+			string component: <code>http://evil-example.net/?x=example.com</code>.
+
+			Address these shortcomings by using anchors in the regular expression instead:
+
+		</p>
+
+		<sample src="examples/MissingRegExpAnchor_GOOD.js"/>
+
+		<p>
+
+			A related mistake is to write a regular expression with
+			multiple alternatives, but to only include an anchor for one of the
+			alternatives. As an example, the regular expression
+			<code>/^www\\.example\\.com|beta\\.example\\.com/</code> will match the host
+			<code>evil.beta.example.com</code> because the regular expression is parsed
+			as <code>/(^www\\.example\\.com)|(beta\\.example\\.com)/</code>
+
+		</p>
+	</example>
+
+	<references>
+		<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">Regular Expressions</a></li>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">XSS Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+	</references>
+</qhelp>
diff --git a/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql b/javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql
@@ -0,0 +1,83 @@
+/**
+ * @name Missing regular expression anchor
+ * @description Regular expressions without anchors can be vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity warning
+ * @precision medium
+ * @id js/regex/missing-regexp-anchor
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+import javascript
+
+/**
+ * Holds if `src` is a pattern for a collection of alternatives where
+ * only the first or last alternative is anchored, indicating a
+ * precedence mistake explained by `msg`.
+ *
+ * The canonical example of such a mistake is: `^a|b|c`, which is
+ * parsed as `(^a)|(b)|(c)`.
+ */
+predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
+  exists(string str, string maybeGroupedStr, string regex, string anchorPart, string escapedDot |
+    // a dot that might be escaped in a regular expression, for example `/\./` or `new RegExp('\\.')`
+    escapedDot = "\\\\[.]" and
+    // a string that is mostly free from special reqular expression symbols
+    str = "(?:(?:" + escapedDot + ")|[a-z:/.?_,@0-9 -])+" and
+    // the string may be wrapped in parentheses
+    maybeGroupedStr = "(?:" + str + "|\\(" + str + "\\))" and
+    (
+      // a problematic pattern: `^a|b|...|x`
+      regex = "(?i)(\\^" + maybeGroupedStr + ")(?:\\|" + maybeGroupedStr + ")+"
+      or
+      // a problematic pattern: `a|b|...|x$`
+      regex = "(?i)(?:" + maybeGroupedStr + "\\|)+(" + maybeGroupedStr + "\\$)"
+    ) and
+    anchorPart = src.getPattern().regexpCapture(regex, 1) and
+    anchorPart.regexpMatch("(?i).*[a-z].*") and
+    msg = "Misleading operator precedence. The subexpression '" + anchorPart + "' is anchored, but the other parts of this regular expression are not"
+  )
+}
+
+/**
+ * Holds if `src` is an unanchored pattern for a URL, indicating a
+ * mistake explained by `msg`.
+ */
+predicate isInterestingUnanchoredRegExpString(RegExpPatternSource src, string msg) {
+  exists(string pattern | pattern = src.getPattern() |
+    // a substring sequence of a protocol and subdomains, perhaps with some regex characters mixed in, followed by a known TLD
+    pattern
+        .regexpMatch("(?i)[():|?a-z0-9-\\\\./]+[.]" + RegExpPatterns::commonTLD() +
+            "([/#?():]\\S*)?") and
+    // without any anchors
+    pattern.regexpMatch("[^$^]+") and
+    // that is not used for capture or replace
+    not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
+      name = "exec" and
+      mcn = src.getARegExpObject().getAMethodCall() and
+      exists(mcn.getAPropertyRead())
+      or
+      exists(DataFlow::Node arg |
+        arg = mcn.getArgument(0) and
+        (
+          src.getARegExpObject().flowsTo(arg) or
+          src.getAParse() = arg
+        )
+      |
+        name = "replace"
+        or
+        name = "match" and exists(mcn.getAPropertyRead())
+      )
+    ) and
+    msg = "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
+  )
+}
+
+from DataFlow::Node nd, string msg
+where
+  isInterestingUnanchoredRegExpString(nd, msg)
+  or
+  isInterestingSemiAnchoredRegExpString(nd, msg)
+select nd, msg
diff --git a/javascript/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.js b/javascript/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.js
@@ -2,7 +2,7 @@ app.get('/some/path', function(req, res) {
     let url = req.param('url'),
         host = urlLib.parse(url).host;
     // BAD: the host of `url` may be controlled by an attacker
-    let regex = /((www|beta).)?example.com/;
+    let regex = /^((www|beta).)?example.com/;
     if (host.match(regex)) {
         res.redirect(url);
     }

diff --git a/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_BAD.js b/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_BAD.js
@@ -0,0 +1,7 @@
+app.get("/some/path", function(req, res) {
+    let url = req.param("url");
+    // BAD: the host of `url` may be controlled by an attacker
+    if (url.match(/https?:\/\/www\.example\.com\//)) {
+        res.redirect(url);
+    }
+});
diff --git a/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_GOOD.js b/javascript/ql/src/Security/CWE-020/examples/MissingRegExpAnchor_GOOD.js
@@ -0,0 +1,7 @@
+app.get("/some/path", function(req, res) {
+    let url = req.param("url");
+    // GOOD: the host of `url` can not be controlled by an attacker
+    if (url.match(/^https?:\/\/www\.example\.com\//)) {
+        res.redirect(url);
+    }
+});
diff --git a/javascript/ql/src/semmle/javascript/Regexp.qll b/javascript/ql/src/semmle/javascript/Regexp.qll
@@ -41,6 +41,9 @@ abstract class RegExpTerm extends Locatable, @regexpterm {
 
   override string toString() { regexpterm(this, _, _, _, result) }
 
+  /** Gets the raw source text of this term. */
+  string getRawValue() { regexpterm(this, _, _, _, result) }
+
   /** Holds if this regular expression term can match the empty string. */
   abstract predicate isNullable();
 
@@ -404,3 +407,94 @@ module RegExpPatterns {
     result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
   }
 }
+
+/**
+ * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
+ * as a part of a regular expression.
+ */
+private DataFlow::Node regExpSource(DataFlow::Node re, DataFlow::TypeBackTracker t) {
+  t.start() and
+  re = result and
+  isInterpretedAsRegExp(result)
+  or
+  exists(DataFlow::TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
+    t2 = t.smallstep(result, succ)
+    or
+    any(TaintTracking::AdditionalTaintStep dts).step(result, succ) and
+    t = t2
+  )
+}
+
+/**
+ * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
+ * as a part of a regular expression.
+ */
+private DataFlow::Node regExpSource(DataFlow::Node re) {
+  result = regExpSource(re, DataFlow::TypeBackTracker::end())
+}
+
+/**
+ * A node whose value may flow to a position where it is interpreted
+ * as a part of a regular expression.
+ */
+abstract class RegExpPatternSource extends DataFlow::Node {
+  /**
+   * Gets a node where the pattern of this node is parsed as a part of
+   * a regular expression.
+   */
+  abstract DataFlow::Node getAParse();
+
+  /**
+   * Gets the pattern of this node that is interpreted as a part of a
+   * regular expression.
+   */
+  abstract string getPattern();
+
+  /**
+   * Gets a regular expression object that is constructed from the pattern
+   * of this node.
+   */
+  abstract DataFlow::SourceNode getARegExpObject();
+}
+
+/**
+ * A regular expression literal, viewed as the pattern source for itself.
+ */
+private class RegExpLiteralPatternSource extends RegExpPatternSource {
+  string pattern;
+
+  RegExpLiteralPatternSource() {
+    exists(string raw | raw = asExpr().(RegExpLiteral).getRoot().getRawValue() |
+      // hide the fact that `/` is escaped in the literal
+      pattern = raw.regexpReplaceAll("\\\\/", "/")
+    )
+  }
+
+  override DataFlow::Node getAParse() { result = this }
+
+  override string getPattern() { result = pattern }
+
+  override DataFlow::SourceNode getARegExpObject() { result = this }
+}
+
+/**
+ * A node whose string value may flow to a position where it is interpreted
+ * as a part of a regular expression.
+ */
+private class StringRegExpPatternSource extends RegExpPatternSource {
+  DataFlow::Node parse;
+
+  StringRegExpPatternSource() { this = regExpSource(parse) }
+
+  override DataFlow::Node getAParse() { result = parse }
+
+  override DataFlow::SourceNode getARegExpObject() {
+    exists(DataFlow::InvokeNode constructor |
+      constructor = DataFlow::globalVarRef("RegExp").getAnInvocation() and
+      parse = constructor.getArgument(0) and
+      result = constructor
+    )
+  }
+
+  override string getPattern() { result = getStringValue() }
+}