Merge remote-tracking branch 'upstream/master' into mergeback-20181217

2026-05-05 21:55:19 +02:00 · 2018-12-17 13:42:45 +01:00
parent 01f58758f1 7adf1d9958
commit 5ac5aa0c2a
12 changed files with 262 additions and 41 deletions
--- a/change-notes/1.20/analysis-javascript.md
+++ b/change-notes/1.20/analysis-javascript.md
@@ -15,6 +15,7 @@
 | **Query**                                     | **Tags**                                             | **Purpose**                                                                                                                                                                 |
 |-----------------------------------------------|------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | Double escaping or unescaping (`js/double-escaping`) | correctness, security, external/cwe/cwe-116 | Highlights potential double escaping or unescaping of special characters, indicating a possible violation of [CWE-116](https://cwe.mitre.org/data/definitions/116.html). Results are shown on LGTM by default. |
+| Incomplete regular expression for hostnames (`js/incomplete-hostname-regexp`) | correctness, security, external/cwe/cwe-020 | Highlights hostname sanitizers that are likely to be incomplete, indicating a violation of [CWE-020](https://cwe.mitre.org/data/definitions/20.html). Results are shown on LGTM by default. |
 | Incomplete URL substring sanitization | correctness, security, external/cwe/cwe-020 | Highlights URL sanitizers that are likely to be incomplete, indicating a violation of [CWE-020](https://cwe.mitre.org/data/definitions/20.html). Results shown on LGTM by default. |
 | Incorrect suffix check (`js/incorrect-suffix-check`) | correctness, security, external/cwe/cwe-020 | Highlights error-prone suffix checks based on `indexOf`, indicating a potential violation of [CWE-20](https://cwe.mitre.org/data/definitions/20.html). Results are shown on LGTM by default. |
 | Useless comparison test (`js/useless-comparison-test`) | correctness | Highlights code that is unreachable due to a numeric comparison that is always true or always false. Results are shown on LGTM by default. |
--- a/cpp/ql/src/semmle/code/cpp/security/CommandExecution.qll
+++ b/cpp/ql/src/semmle/code/cpp/security/CommandExecution.qll
@@ -159,17 +159,6 @@ predicate shellCommandPreface(string cmd, string flag) {
  )
 }

-/**
- * An array element. This supports multiple kinds of array syntax.
- */
-private predicate arrayElement(Expr arrayLit, int idx, Expr element) {
-  exists (ArrayLiteral lit | lit = arrayLit |
-    lit.getElement(idx) = element)
-  or exists (MessageExpr arrayWithObjects | arrayWithObjects = arrayLit |
-    arrayWithObjects.getStaticTarget().getQualifiedName().matches("NSArray%::+arrayWithObjects:") and
-    arrayWithObjects.getArgument(idx) = element)
-}
-
 /**
 * A command that is used as a command, or component of a command,
 * that will be executed by a general-purpose command interpreter
--- a/javascript/config/suites/javascript/security
+++ b/javascript/config/suites/javascript/security
@@ -1,5 +1,6 @@
 + semmlecode-javascript-queries/DOM/TargetBlank.ql: /Security/CWE/CWE-200
 + semmlecode-javascript-queries/Electron/EnablingNodeIntegration.ql: /Security/CWE/CWE-094
+ semmlecode-javascript-queries/Security/CWE-020/IncompleteHostnameRegExp.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlSubstringSanitization.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-020/IncorrectSuffixCheck.ql: /Security/CWE/CWE-020
 + semmlecode-javascript-queries/Security/CWE-022/TaintedPath.ql: /Security/CWE/CWE-022
--- a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
+++ b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
@@ -0,0 +1,72 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted URLs is an important technique for
+			preventing attacks such as request forgeries and malicious
+			redirections. Often, this is done by checking that the host of a URL
+			is in a set of allowed hosts.
+
+		</p>
+
+		<p>
+
+			If a regular expression implements such a check, it is
+			easy to accidentally make the check too permissive by not escaping the
+			<code>.</code> meta-characters appropriately.
+
+			Even if the check is not used in a security-critical
+			context, the incomplete check may still cause undesirable behaviors
+			when it accidentally succeeds.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Escape all meta-characters appropriately when constructing
+			regular expressions for security checks, and pay special attention to the
+			<code>.</code> meta-character.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains.
+
+		</p>
+
+		<sample src="examples/IncompleteHostnameRegExp.js"/>
+
+		<p>
+
+			The check is however easy to bypass because the unescaped
+			<code>.</code> allows for any character before
+			<code>example.com</code>, effectively allowing the redirect to go to
+			an attacker-controlled domain such as <code>wwwXexample.com</code>.
+
+		</p>
+		<p>
+
+			Address this vulnerability by escaping <code>.</code>
+			appropriately: <code>let regex = /(www|beta|)\.example\.com/</code>.
+
+		</p>
+
+	</example>
+
+	<references>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a>.</li>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+	</references>
+</qhelp>
--- a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
+++ b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -0,0 +1,64 @@
+/**
+ * @name Incomplete regular expression for hostnames
+ * @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
+ * @kind problem
+ * @problem.severity warning
+ * @precision high
+ * @id js/incomplete-hostname-regexp
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-020
+ */
+
+import javascript
+
+/**
+ * A taint-tracking configuration whose sources are expressions with a string value that is an incomplete hostname pattern, and whose sinks are values interpreted as regular expressions.
+ */
+class Configuration extends TaintTracking::Configuration {
+  Configuration() { this = "IncompleteHostnameRegExpTracking" }
+
+  override
+  predicate isSource(DataFlow::Node source) {
+    isIncompleteHostNameRegExpPattern(source.asExpr().getStringValue(), _)
+  }
+
+  override
+  predicate isSink(DataFlow::Node sink) {
+    isInterpretedAsRegExp(sink)
+  }
+
+}
+
+
+/**
+ * Holds if `pattern` contains a `.` that is not preceded by a backslash and is immediately followed by
+ * a host name ending in a common TLD; `hostPart` is that host name (capture group 1 of the regex below).
+ */
+bindingset[pattern]
+predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
+  hostPart = pattern.regexpCapture(
+    "(?i).*" +
+    // a single `.` that is not preceded by a backslash (the negative lookbehind rejects `\.`)
+    "(?<!\\\\)[.]" +
+    // immediately followed by one domain label (letters, digits, `-`, possibly mixed with the regex characters `():|?`), an optionally escaped `.`, and a common TLD
+    "([():|?a-z0-9-]+(\\\\)?[.](" + RegExpPatterns::commonTLD() + "))" +
+    ".*", 1)
+}
+
+from Expr e, string pattern, string hostPart
+where
+      (
+        e.(RegExpLiteral).getValue() = pattern or
+        exists (Configuration cfg |
+          cfg.hasFlow(e.flow(), _) and
+          e.mayHaveStringValue(pattern)
+        )
+      ) and
+      isIncompleteHostNameRegExpPattern(pattern, hostPart)
+      and
+      // ignore patterns with a non-capturing group (`(?:...)`) after the TLD
+      not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*")
+
+
+select e, "This regular expression has an unescaped '.' before '" + hostPart + "', so it might match more hosts than expected."
--- a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
+++ b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
@@ -21,7 +21,7 @@ where
      substring.mayHaveStringValue(target) and
      (
        // target contains a domain on a common TLD, and perhaps some other URL components
-        target.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(com|org|edu|gov|uk|net)(:[0-9]+)?/?") or
+        target.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + RegExpPatterns::commonTLD() + ")(:[0-9]+)?/?") or
        // target is a HTTP URL to a domain on any TLD
        target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
      ) and
--- a/javascript/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.js
+++ b/javascript/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.js
@@ -0,0 +1,9 @@
+app.get('/some/path', function(req, res) {
+    let url = req.param('url'),
+        host = urlLib.parse(url).host;
+    // BAD: the unescaped `.` before `example.com` matches any character, so a host such as `wwwXexample.com` passes the check
+    let regex = /(www|beta|).example.com/;
+    if (host.match(regex)) {
+        res.redirect(url);
+    }
+});
--- a/javascript/ql/src/semmle/javascript/Regexp.qll
+++ b/javascript/ql/src/semmle/javascript/Regexp.qll
@@ -1,11 +1,12 @@
 /**
- * Provides classes for working with regular expression literals.
+ * Provides classes for working with regular expressions.
 *
- * Regular expressions are represented as an abstract syntax tree of regular expression
+ * Regular expression literals are represented as an abstract syntax tree of regular expression
 * terms.
 */

 import javascript
+private import semmle.javascript.dataflow.InferredTypes

 /**
 * An element containing a regular expression term, that is, either
@@ -484,3 +485,41 @@ class RegExpParseError extends Error, @regexp_parse_error {
    result = getMessage()
  }
 }
+
+/**
+ * Holds if `source` may be interpreted as a regular expression: it is the first argument of a `RegExp` invocation, or the sole argument of a `match` or `search` call whose receiver may be a string.
+ */
+predicate isInterpretedAsRegExp(DataFlow::Node source) {
+  // The first argument to an invocation of `RegExp` (with or without `new`).
+  source = DataFlow::globalVarRef("RegExp").getAnInvocation().getArgument(0)
+  or
+  // The argument of a call that coerces the argument to a regular expression.
+  exists(MethodCallExpr mce, string methodName |
+    mce.getReceiver().analyze().getAType() = TTString() and
+    mce.getMethodName() = methodName
+    |
+    (methodName = "match" and source.asExpr() = mce.getArgument(0) and mce.getNumArgument() = 1)
+    or
+    (
+      methodName = "search" and
+      source.asExpr() = mce.getArgument(0) and
+      mce.getNumArgument() = 1 and
+      // "search" is a common method name; `String.prototype.search` returns a number,
+      // so a call whose result is the base of a property access is excluded
+      not exists(PropAccess p | p.getBase() = mce)
+    )
+  )
+}
+
+/**
+ * Provides regular expression patterns.
+ */
+module RegExpPatterns {
+  /**
+   * Gets a `|`-separated alternation of common top-level domain names; it is not parenthesized, so callers must wrap it in a group.
+   */
+  string commonTLD() {
+    // according to ranking by http://google.com/search?q=site:.<<TLD>>
+    result = "com|org|edu|gov|uk|net|io"
+  }
+}
--- a/javascript/ql/src/semmle/javascript/security/dataflow/RegExpInjection.qll
+++ b/javascript/ql/src/semmle/javascript/security/dataflow/RegExpInjection.qll
@@ -4,7 +4,6 @@
 */

 import javascript
-private import semmle.javascript.dataflow.InferredTypes

 module RegExpInjection {
  /**
@@ -51,36 +50,14 @@ module RegExpInjection {
  }

  /**
-   * The first argument to an invocation of `RegExp` (with or without `new`).
+   * The source string of a regular expression.
   */
-  class RegExpObjectCreationSink extends Sink, DataFlow::ValueNode {
-    RegExpObjectCreationSink() {
-      this = DataFlow::globalVarRef("RegExp").getAnInvocation().getArgument(0)
+  class RegularExpressionSourceAsSink extends Sink {
+    RegularExpressionSourceAsSink() {
+      isInterpretedAsRegExp(this)
    }
  }

-  /**
-   * The argument of a call that coerces the argument to a regular expression.
-   */
-  class RegExpObjectCoercionSink extends Sink {
-
-    RegExpObjectCoercionSink() {
-      exists (MethodCallExpr mce, string methodName |
-        mce.getReceiver().analyze().getAType() = TTString() and
-        mce.getMethodName() = methodName |
-        (methodName = "match" and this.asExpr() = mce.getArgument(0) and mce.getNumArgument() = 1) or
-        (
-           methodName = "search" and
-           this.asExpr() = mce.getArgument(0) and
-           mce.getNumArgument() = 1 and
-           // `String.prototype.search` returns a number, so exclude chained accesses
-           not exists(PropAccess p | p.getBase() = mce)
-        )
-      )
-    }
-
-  }
-
  /**
   * A call to a function whose name suggests that it escapes regular
   * expression meta-characters.
--- a/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
+++ b/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
@@ -0,0 +1,21 @@
+| tst-IncompleteHostnameRegExp.js:3:2:3:28 | /http:\\ ... le.com/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:5:2:5:28 | /http:\\ ... le.net/ | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:6:2:6:42 | /http:\\ ... b).com/ | This regular expression has an unescaped '.' before '(example-a\|example-b).com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:11:13:11:37 | "http:/ ... le.com" | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:12:10:12:34 | "http:/ ... le.com" | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:15:22:15:46 | "http:/ ... le.com" | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:17:13:17:31 | `test.example.com$` | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:17:14:17:30 | test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:19:17:19:34 | 'test.example.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:22:27:22:44 | 'test.example.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:28:22:28:39 | 'test.example.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:37:2:37:54 | /^(http ... =$\|\\/)/ | This regular expression has an unescaped '.' before ')?example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:39:2:39:34 | /\\(http ... m\\/\\)/g | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:40:2:40:29 | /https? ... le.com/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:41:13:41:68 | '^http: ... e\\.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:41:41:41:68 | '^https ... e\\.com' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:42:13:42:61 | 'http[s ... \\/(.+)' | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:43:2:43:33 | /^https ... e.com$/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:44:9:44:100 | 'protos ... ernal)' | This regular expression has an unescaped '.' before 'example-b.com', so it might match more hosts than expected. |
+| tst-IncompleteHostnameRegExp.js:46:2:46:26 | /exampl ... le.com/ | This regular expression has an unescaped '.' before 'dev\|example.com', so it might match more hosts than expected. |
--- a/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref
+++ b/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref
@@ -0,0 +1 @@
+Security/CWE-020/IncompleteHostnameRegExp.ql
--- a/javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteHostnameRegExp.js
+++ b/javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteHostnameRegExp.js
@@ -0,0 +1,47 @@
+(function() {
+	/http:\/\/example.com/; // OK
+	/http:\/\/test.example.com/; // NOT OK
+	/http:\/\/test\\.example.com/; // OK
+	/http:\/\/test.example.net/; // NOT OK
+	/http:\/\/test.(example-a|example-b).com/; // NOT OK
+	/http:\/\/(.+)\\.example.com/; // NOT OK, but not yet supported with enough precision
+	/http:\/\/(\\.+)\\.example.com/; // OK
+	/http:\/\/(?:.+)\\.test\\.example.com/; // NOT OK, but not yet supported with enough precision
+	/http:\/\/test.example.com\/(?:.*)/; // OK
+	new RegExp("http://test.example.com"); // NOT OK
+	s.match("http://test.example.com"); // NOT OK
+
+	function id(e) { return e; }
+	new RegExp(id(id(id("http://test.example.com")))); // NOT OK
+
+	new RegExp(`test.example.com$`); // NOT OK
+
+	let hostname = 'test.example.com'; // NOT OK
+	new RegExp(`${hostname}$`);
+
+	let domain = { hostname: 'test.example.com' }; // NOT OK
+	new RegExp(domain.hostname);
+
+	function convert(domain) {
+		return new RegExp(domain.hostname);
+	}
+	convert({ hostname: 'test.example.com' }); // NOT OK
+
+	let domains = [ { hostname: 'test.example.com' } ];  // NOT OK, but not yet supported
+	function convert(domain) {
+		return new RegExp(domain.hostname);
+	}
+	domains.map(d => convert(d));
+
+	/(.+\.(?:example-a|example-b)\.com)/; // NOT OK, but not yet supported with enough precision
+	/^(https?:)?\/\/((service|www).)?example.com(?=$|\/)/; // NOT OK
+	/^(http|https):\/\/www.example.com\/p\/f\//; // NOT OK
+	/\(http:\/\/sub.example.com\/\)/g; // NOT OK
+	/https?:\/\/api.example.com/; // NOT OK
+	new RegExp('^http://localhost:8000|' + '^https?://.+\.example\.com'); // NOT OK
+	new RegExp('http[s]?:\/\/?sub1\.sub2\.example\.com\/f\/(.+)'); // NOT OK
+	/^https:\/\/[a-z]*.example.com$/; // NOT OK
+	RegExp('protos?://(localhost|.+.example.net|.+.example-a.com|.+.example-b.com|.+.example.internal)'); // NOT OK
+
+	/example.dev|example.com/; // OK, but still flagged
+});
				`@@ -0,0 +1 @@`
				`Security/CWE-020/IncompleteHostnameRegExp.ql`