Ruby: improve tracking of regular expressions

There are two flavours of `match?`. If the receiver of `match?` has type String
then the argument to `match?` is a regular expression. However, if the receiver of
`match?` has type Regexp then the argument is the text.

The roles of receiver and argument flip depending on the type of the receiver. This
caused a lot of false positives when looking for string-like literals that are
used as regular expressions.

This commit attempts to improve things by trying to determine whether the
receiver is known to be of type Regexp. In such cases we know that the argument
is unlikely to be a regular expression.
This commit is contained in:
Arthur Baars
2022-09-29 11:33:35 +02:00
parent 0160c374e4
commit 44cc6f7350
4 changed files with 33 additions and 7 deletions

View File

@@ -114,7 +114,7 @@ class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
mce.getMethodName() = ["match", "match?"] and
this = mce.getArgument(0) and
// exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
not mce.getReceiver().asExpr().getExpr() instanceof Ast::RegExpLiteral
not mce.getReceiver() = trackRegexpType()
)
}
}

View File

@@ -3,6 +3,8 @@ private import codeql.ruby.ast.Literal as Ast
private import codeql.ruby.DataFlow
private import codeql.ruby.controlflow.CfgNodes
private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl
private import codeql.ruby.typetracking.TypeTracker
private import codeql.ruby.ApiGraphs
class RegExpConfiguration extends Configuration {
RegExpConfiguration() { this = "RegExpConfiguration" }
@@ -19,12 +21,26 @@ class RegExpConfiguration extends Configuration {
override predicate isSink(DataFlow::Node sink) { sink instanceof RegExpInterpretation::Range }
override predicate isSanitizer(DataFlow::Node node) {
// stop flow if `node` is receiver of
// https://ruby-doc.org/core-2.4.0/String.html#method-i-match
exists(DataFlow::CallNode mce |
mce.getMethodName() = ["match", "match?"] and
exists(DataFlow::CallNode mce | mce.getMethodName() = ["match", "match?"] |
// receiver of https://ruby-doc.org/core-2.4.0/String.html#method-i-match
node = mce.getReceiver() and
mce.getArgument(0).asExpr().getExpr() instanceof Ast::RegExpLiteral
mce.getArgument(0) = trackRegexpType()
or
// first argument of https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match
node = mce.getArgument(0) and
mce.getReceiver() = trackRegexpType()
)
}
}
/**
 * Gets a local source node that type tracker `t` reaches from a regular-expression source.
 *
 * The sources are regular-expression literals and calls to `compile` or `new`
 * on the top-level `Regexp` member.
 */
private DataFlow::LocalSourceNode trackRegexpType(TypeTracker t) {
// base case: the tracker starts at a regexp literal or a `Regexp.compile`/`Regexp.new` call
t.start() and
(
result.asExpr().getExpr() instanceof Ast::RegExpLiteral or
result = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"])
)
or
// recursive case: extend an already-tracked node by one type-tracking step from `t2` to `t`
exists(TypeTracker t2 | result = trackRegexpType(t2).track(t2, t))
}
/**
 * Gets a data-flow node that a tracked regular-expression source flows to,
 * i.e. a node reached from `trackRegexpType(TypeTracker::end())`.
 */
DataFlow::Node trackRegexpType() { trackRegexpType(TypeTracker::end()).flowsTo(result) }

View File

@@ -16,3 +16,5 @@
| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:60:20:60:39 | "http://example.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
| missing_regexp_anchor.rb:61:19:61:38 | "http://example.com" | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |

View File

@@ -50,4 +50,12 @@ foo.sub!(/www\.example\.com/, "bar") # GOOD
/^good\\\\.com|better\\\\.com/ # BAD
/^foo|bar|baz$/ # BAD
/^foo|%/ # OK
/^foo|%/ # OK
REGEXP = /foo/
REGEXP.match? "http://example.com" # GOOD: the url is the text not the regexp
REGEXP.match "http://example.com" # GOOD: the url is the text not the regexp
"http://example.com".match? REGEXP # GOOD: the url is the text not the regexp
"http://example.com".match REGEXP # GOOD: the url is the text not the regexp
"some text".match? "http://example.com" # BAD
"some text".match "http://example.com" # BAD