Merge remote-tracking branch 'upstream/main' into incomplete-url-string-sanitization

Conflicts: config/identical-files.json javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitization.qll
2025-12-22 11:46:32 +01:00 · 2022-03-18 16:08:59 +01:00
parent f95e1efb67 117fb5be7d
commit bf888f0f0b
1308 changed files with 77789 additions and 53601 deletions
--- a/ruby/ql/src/queries/meta/internal/TaintMetrics.qll
+++ b/ruby/ql/src/queries/meta/internal/TaintMetrics.qll
@@ -25,7 +25,7 @@ DataFlow::Node relevantTaintSink(string kind) {
    or
    kind = "CommandInjection" and result instanceof CommandInjection::Sink
    or
-    kind = "XSS" and result instanceof ReflectedXSS::Sink
+    kind = "XSS" and result instanceof ReflectedXss::Sink
    or
    kind = "PathInjection" and result instanceof PathInjection::Sink
    or
--- a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll
+++ b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll
@@ -0,0 +1,202 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+private import HostnameRegexpSpecific
+
+/**
+ * Holds if the given constant is unlikely to occur in the origin part of a URL.
+ */
+predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
+  // The pattern has one alternative for each of these cases, in order:
+  // - A character that can't occur in the origin: `[^a-zA-Z0-9.:/-]`
+  // - Two dashes in a row: `--`
+  // - A colon that is not part of port or scheme separator: `:[^0-9/]`
+  // - A slash that is not part of scheme separator: `(?<![/:]|^)/`
+  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
+}
+
+/** Holds if `term` is a literal dot, written either as `\.` or as the class `[.]`. */
+predicate isDotConstant(RegExpTerm term) {
+  // `[.]`: a non-inverted character class whose only member is the constant `.`
+  exists(RegExpCharacterClass dotClass | dotClass = term |
+    dotClass.getNumChild() = 1 and
+    not dotClass.isInverted() and
+    dotClass.getAChild().(RegExpConstant).getValue() = "."
+  )
+  or
+  term.(RegExpCharEscape).getValue() = "." // `\.`
+}
+
+/** Holds if `term` is either the wildcard `.` or a literal `.` character. */
+predicate isDotLike(RegExpTerm term) {
+  isDotConstant(term)
+  or
+  term instanceof RegExpDot
+}
+
+/** Holds if `term` will only ever be matched against the beginning of the input. */
+predicate matchesBeginningOfString(RegExpTerm term) {
+  term.isRootTerm() // base case of the recursion
+  or
+  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
+    term = parent.(RegExpSequence).getChild(0) // first element of a sequence
+    or
+    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and // skip over a leading `^`
+    term = parent.(RegExpSequence).getChild(1)
+    or
+    term = parent.(RegExpAlt).getAChild() // any branch of an alternation
+    or
+    term = parent.(RegExpGroup).getAChild() // the contents of a group
+  )
+}
+
+/**
+ * Holds if the given sequence contains a top-level domain preceded by a dot, such as `.com`,
+ * excluding cases where this is at the very beginning of the regexp. The constant holding the
+ * TLD may continue with a port (`:80`) and a suffix starting with `/`, `?` or `#`.
+ * `i` is bound to the index of the last child in the top-level domain part.
+ */
+predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
+  seq.getChild(i)
+      .(RegExpConstant)
+      .getValue()
+      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
+  isDotLike(seq.getChild(i - 1)) and // the dot right before the TLD
+  not (i = 1 and matchesBeginningOfString(seq)) // skip a `.tld` that opens the regexp
+}
+
+/**
+ * Holds if the sequence `seq` contains a top-level domain preceded by a dot,
+ * such as `.com`, at any child index.
+ */
+predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
+
+/**
+ * Holds if `term` will always match a hostname, that is, all disjunctions contain
+ * a hostname pattern that isn't inside a quantifier.
+ */
+predicate alwaysMatchesHostname(RegExpTerm term) {
+  hasTopLevelDomainEnding(term, _)
+  or
+  // `localhost` has no TLD; `regexpMatch` is a full match, so `.*` lets it occur anywhere
+  term.(RegExpConstant).getValue().regexpMatch(".*\\blocalhost\\b.*")
+  or
+  not term instanceof RegExpAlt and
+  not term instanceof RegExpQuantifier and
+  alwaysMatchesHostname(term.getAChild())
+  or
+  alwaysMatchesHostnameAlt(term)
+}
+
+/** Holds if every child of `alt` contains a hostname pattern. */
+predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
+  alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1) // index of the last child
+}
+
+/**
+ * Holds if each of the children of `alt` at indices `0` through `i` contains a hostname pattern.
+ *
+ * This is used instead of `forall` to avoid materializing the set of alternatives
+ * that don't contain hostnames, which is much larger.
+ */
+predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
+  alwaysMatchesHostname(alt.getChild(0)) and i = 0 // base case: the first alternative
+  or
+  alwaysMatchesHostnameAlt(alt, i - 1) and // all earlier alternatives already qualify
+  alwaysMatchesHostname(alt.getChild(i))
+}
+
+/**
+ * Holds if an ancestor of `term` is an alternative or a quantifier (so `term`
+ * can not be expected to correspond to a unique match), or a lookaround
+ * assertion (which are rarely used for capture groups).
+ */
+predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
+  exists(RegExpParent enclosing | enclosing = term.getParent() |
+    isInsideChoiceOrSubPattern(enclosing)
+    or
+    enclosing instanceof RegExpSubPattern
+    or
+    enclosing instanceof RegExpQuantifier
+    or
+    enclosing instanceof RegExpAlt
+  )
+}
+
+/**
+ * Holds if `group` is likely to be used as a capture group.
+ */
+predicate isLikelyCaptureGroup(RegExpGroup group) {
+  not isInsideChoiceOrSubPattern(group) and // not below a choice, quantifier or sub-pattern
+  group.isCapture()
+}
+
+/**
+ * Holds if two adjacent children of `seq` are both dots, either `.` wildcards or escaped dots.
+ *
+ * In that case at least one of the dots is not meant as a subdomain separator,
+ * so such patterns are not flagged.
+ */
+predicate hasConsecutiveDots(RegExpSequence seq) {
+  exists(int first |
+    isDotLike(seq.getChild(first + 1)) and
+    isDotLike(seq.getChild(first))
+  )
+}
+
+predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
+  seq = regexp.getAChild*() and // `regexp` itself or any sequence nested inside it
+  exists(RegExpDot unescapedDot, int i, string hostname |
+    hasTopLevelDomainEnding(seq, i) and // child `i` ends the `.tld` part
+    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
+    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
+    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and // a wildcard `.` before the TLD
+    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
+    not hasConsecutiveDots(unescapedDot.getParent()) and
+    hostname = // raw text of the three children ending at index `i`
+      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
+        seq.getChild(i).getRawValue()
+  |
+    if unescapedDot.getParent() instanceof RegExpQuantifier
+    then
+      // `.*\.example.com` can match `evil.com/?x=.example.com`
+      //
+      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
+      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
+      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
+      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
+      seq.getChild(0) instanceof RegExpCaret and
+      not seq.getAChild() instanceof RegExpDollar and
+      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
+      msg =
+        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
+          + "' which may cause '" + hostname +
+          "' to be matched anywhere in the URL, outside the hostname."
+    else // the wildcard `.` is not directly quantified
+      msg =
+        "has an unescaped '.' before '" + hostname +
+          "', so it might match more hosts than expected."
+  )
+}
+
+predicate incompleteHostnameRegExp(
+  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
+) {
+  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
+    regexp = re.getRegExpTerm() and
+    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
+    (
+      if re.getAParse() != re // the pattern source is parsed as a regexp at a different node
+      then (
+        kind = "string, which is used as a regular expression $@," and
+        aux = re.getAParse()
+      ) else (
+        kind = "regular expression" and aux = re
+      )
+    )
+  |
+    message = "This " + kind + " " + msg and label = "here" // `kind` and `msg` form one sentence
+  )
+}
--- a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpSpecific.qll
+++ b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpSpecific.qll
@@ -0,0 +1,2 @@
+import codeql.ruby.security.performance.RegExpTreeView
+import codeql.ruby.DataFlow
--- a/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.qhelp
+++ b/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.qhelp
@@ -0,0 +1,72 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted URLs is an important technique for
+			preventing attacks such as request forgeries and malicious
+			redirections. Often, this is done by checking that the host of a URL
+			is in a set of allowed hosts.
+
+		</p>
+
+		<p>
+
+			If a regular expression implements such a check, it is
+			easy to accidentally make the check too permissive by not escaping the
+			<code>.</code> meta-characters appropriately.
+
+			Even if the check is not used in a security-critical
+			context, the incomplete check may still cause undesirable behaviors
+			when it accidentally succeeds.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Escape all meta-characters appropriately when constructing
+			regular expressions for security checks, and pay special attention to the
+			<code>.</code> meta-character.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains.
+
+		</p>
+
+		<sample src="examples/IncompleteHostnameRegExp.rb"/>
+
+		<p>
+
+			The check is however easy to bypass because the unescaped
+			<code>.</code> allows for any character before
+			<code>example.com</code>, effectively allowing the redirect to go to
+			an attacker-controlled domain such as <code>wwwXexample.com</code>.
+
+		</p>
+		<p>
+
+			Address this vulnerability by escaping <code>.</code>
+			appropriately: <code>regex = /^((www|beta)\.)?example\.com/</code>.
+
+		</p>
+
+	</example>
+
+	<references>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+		<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+	</references>
+</qhelp>
--- a/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.ql
+++ b/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.ql
@@ -0,0 +1,16 @@
+/**
+ * @name Incomplete regular expression for hostnames
+ * @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id rb/incomplete-hostname-regexp
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-020
+ */
+
+import HostnameRegexpShared
+
+query predicate problems = incompleteHostnameRegExp/4;
--- a/ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitization.qll
+++ b/ruby/ql/src/queries/security/cwe-020/IncompleteUrlSubstringSanitization.qll
@@ -31,7 +31,7 @@ query predicate problems(
  (
    // target contains a domain on a common TLD, and perhaps some other URL components
    target
-        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +
+        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
            "(:[0-9]+)?/?")
    or
    // target is a HTTP URL to a domain on any TLD
--- a/ruby/ql/src/queries/security/cwe-020/examples/IncompleteHostnameRegExp.rb
+++ b/ruby/ql/src/queries/security/cwe-020/examples/IncompleteHostnameRegExp.rb
@@ -0,0 +1,13 @@
+class AppController < ApplicationController
+
+    def index
+        url = params[:url]
+        host = URI(url).host
+        # BAD: the unescaped `.` in the regex matches any character, e.g. the host `wwwXexample.com`
+        regex = /^((www|beta).)?example.com/
+        if host.match(regex)
+            redirect_to url
+        end
+    end
+
+end
--- a/ruby/ql/src/queries/security/cwe-079/ReflectedXSS.ql
+++ b/ruby/ql/src/queries/security/cwe-079/ReflectedXSS.ql
@@ -18,7 +18,7 @@ import codeql.ruby.security.ReflectedXSSQuery
 import codeql.ruby.DataFlow
 import DataFlow::PathGraph

-from ReflectedXSS::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
+from ReflectedXss::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
 where config.hasFlowPath(source, sink)
 select sink.getNode(), source, sink, "Cross-site scripting vulnerability due to $@.",
  source.getNode(), "a user-provided value"
--- a/ruby/ql/src/queries/security/cwe-079/StoredXSS.ql
+++ b/ruby/ql/src/queries/security/cwe-079/StoredXSS.ql
@@ -17,7 +17,7 @@ import codeql.ruby.security.StoredXSSQuery
 import codeql.ruby.DataFlow
 import DataFlow::PathGraph

-from StoredXSS::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
+from StoredXss::Configuration config, DataFlow::PathNode source, DataFlow::PathNode sink
 where config.hasFlowPath(source, sink)
 select sink.getNode(), source, sink, "Cross-site scripting vulnerability due to $@",
  source.getNode(), "stored value"
--- a/ruby/ql/src/queries/security/cwe-089/SqlInjection.ql
+++ b/ruby/ql/src/queries/security/cwe-089/SqlInjection.ql
@@ -19,8 +19,8 @@ import codeql.ruby.dataflow.RemoteFlowSources
 import codeql.ruby.TaintTracking
 import DataFlow::PathGraph

-class SQLInjectionConfiguration extends TaintTracking::Configuration {
-  SQLInjectionConfiguration() { this = "SQLInjectionConfiguration" }
+class SqlInjectionConfiguration extends TaintTracking::Configuration {
+  SqlInjectionConfiguration() { this = "SQLInjectionConfiguration" }

  override predicate isSource(DataFlow::Node source) { source instanceof RemoteFlowSource }

@@ -32,7 +32,7 @@ class SQLInjectionConfiguration extends TaintTracking::Configuration {
  }
 }

-from SQLInjectionConfiguration config, DataFlow::PathNode source, DataFlow::PathNode sink
+from SqlInjectionConfiguration config, DataFlow::PathNode source, DataFlow::PathNode sink
 where config.hasFlowPath(source, sink)
 select sink.getNode(), source, sink, "This SQL query depends on $@.", source.getNode(),
  "a user-provided value"
--- a/ruby/ql/src/queries/security/cwe-116/BadTagFilter.ql
+++ b/ruby/ql/src/queries/security/cwe-116/BadTagFilter.ql
@@ -16,6 +16,6 @@

 import codeql.ruby.security.BadTagFilterQuery

-from HTMLMatchingRegExp regexp, string msg
+from HtmlMatchingRegExp regexp, string msg
 where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one
 select regexp, msg