Ruby: improve tracking of regular expressions

There are two flavours of `match?` (and likewise of `match`). If the receiver of `match?` has type String, then the argument to `match?` is a regular expression. However, if the receiver of `match?` has type Regexp, then the argument is the text to match against. Because the roles of receiver and argument flip depending on the type of the receiver, this caused a lot of false positives when looking for string-like literals that are used as regular expressions. This commit attempts to improve things by using type tracking to determine whether the receiver is known to be of type Regexp. In such cases we know that the argument is unlikely to be a regular expression.
2026-04-30 11:15:13 +02:00 · 2022-09-29 11:33:35 +02:00
parent 0160c374e4
commit 44cc6f7350
4 changed files with 33 additions and 7 deletions
--- a/ruby/ql/lib/codeql/ruby/Regexp.qll
+++ b/ruby/ql/lib/codeql/ruby/Regexp.qll
@@ -114,7 +114,7 @@ class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
      mce.getMethodName() = ["match", "match?"] and
      this = mce.getArgument(0) and
      // exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
-      not mce.getReceiver().asExpr().getExpr() instanceof Ast::RegExpLiteral
+      not mce.getReceiver() = trackRegexpType()
    )
  }
 }
--- a/ruby/ql/lib/codeql/ruby/regexp/internal/RegExpConfiguration.qll
+++ b/ruby/ql/lib/codeql/ruby/regexp/internal/RegExpConfiguration.qll
@@ -3,6 +3,8 @@ private import codeql.ruby.ast.Literal as Ast
 private import codeql.ruby.DataFlow
 private import codeql.ruby.controlflow.CfgNodes
 private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl
+private import codeql.ruby.typetracking.TypeTracker
+private import codeql.ruby.ApiGraphs

 class RegExpConfiguration extends Configuration {
  RegExpConfiguration() { this = "RegExpConfiguration" }
@@ -19,12 +21,26 @@ class RegExpConfiguration extends Configuration {
  override predicate isSink(DataFlow::Node sink) { sink instanceof RegExpInterpretation::Range }

  override predicate isSanitizer(DataFlow::Node node) {
-    // stop flow if `node` is receiver of
-    // https://ruby-doc.org/core-2.4.0/String.html#method-i-match
-    exists(DataFlow::CallNode mce |
-      mce.getMethodName() = ["match", "match?"] and
+    exists(DataFlow::CallNode mce | mce.getMethodName() = ["match", "match?"] |
+      // receiver of https://ruby-doc.org/core-2.4.0/String.html#method-i-match
      node = mce.getReceiver() and
-      mce.getArgument(0).asExpr().getExpr() instanceof Ast::RegExpLiteral
+      mce.getArgument(0) = trackRegexpType()
+      or
+      // first argument of https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
+      node = mce.getArgument(0) and
+      mce.getReceiver() = trackRegexpType()
    )
  }
 }
+
+private DataFlow::LocalSourceNode trackRegexpType(TypeTracker t) {
+  t.start() and
+  (
+    result.asExpr().getExpr() instanceof Ast::RegExpLiteral or
+    result = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"])
+  )
+  or
+  exists(TypeTracker t2 | result = trackRegexpType(t2).track(t2, t))
+}
+
+DataFlow::Node trackRegexpType() { trackRegexpType(TypeTracker::end()).flowsTo(result) }
--- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
@@ -16,3 +16,5 @@
 | missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
 | missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
 | missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:60:20:60:39 | "http://example.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:61:19:61:38 | "http://example.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
--- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
+++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
@@ -50,4 +50,12 @@ foo.sub!(/www\.example\.com/, "bar") # GOOD
 /^good\\\\.com|better\\\\.com/ # BAD

 /^foo|bar|baz$/ # BAD
-/^foo|%/ # OK
+/^foo|%/ # OK
+
+REGEXP = /foo/
+REGEXP.match? "http://example.com" # GOOD: the url is the text not the regexp
+REGEXP.match "http://example.com" # GOOD: the url is the text not the regexp
+"http://example.com".match? REGEXP  # GOOD: the url is the text not the regexp
+"http://example.com".match REGEXP  # GOOD: the url is the text not the regexp
+"some text".match? "http://example.com" # BAD
+"some text".match "http://example.com" # BAD