Merge pull request #8573 from hmac/hmac/missing-regexp-anchor

Ruby: Add MissingRegExpAnchor query
This commit is contained in:
Harry Maclean
2022-04-28 11:06:33 +12:00
committed by GitHub
11 changed files with 467 additions and 5 deletions

View File

@@ -63,6 +63,9 @@ class RegExpParent extends TRegExpParent {
/** Gets the number of child terms. */
int getNumChild() { result = count(this.getAChild()) }
/** Gets the last child term of this element. */
RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
/**
* Gets the name of a primary CodeQL class to which this regular
* expression term belongs.
@@ -578,6 +581,15 @@ class RegExpWordBoundary extends RegExpSpecialChar {
RegExpWordBoundary() { this.getChar() = "\\b" }
}
/**
* A non-word boundary, that is, a regular expression term of the form `\B`.
*/
class RegExpNonWordBoundary extends RegExpSpecialChar {
RegExpNonWordBoundary() { this.getChar() = "\\B" }
override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" }
}
/**
* A character class escape in a regular expression.
* That is, an escaped character that denotes multiple characters.
@@ -857,6 +869,21 @@ class RegExpDot extends RegExpSpecialChar {
override string getAPrimaryQlClass() { result = "RegExpDot" }
}
/**
* A term that matches a specific position between characters in the string.
*
* Example:
*
* ```
* \A
* ```
*/
class RegExpAnchor extends RegExpSpecialChar {
RegExpAnchor() { this.getChar() = ["^", "$", "\\A", "\\Z", "\\z"] }
override string getAPrimaryQlClass() { result = "RegExpAnchor" }
}
/**
* A dollar assertion `$` or `\Z` matching the end of a line.
*
@@ -866,7 +893,7 @@ class RegExpDot extends RegExpSpecialChar {
* $
* ```
*/
class RegExpDollar extends RegExpSpecialChar {
class RegExpDollar extends RegExpAnchor {
RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] }
override string getAPrimaryQlClass() { result = "RegExpDollar" }
@@ -881,7 +908,7 @@ class RegExpDollar extends RegExpSpecialChar {
* ^
* ```
*/
class RegExpCaret extends RegExpSpecialChar {
class RegExpCaret extends RegExpAnchor {
RegExpCaret() { this.getChar() = ["^", "\\A"] }
override string getAPrimaryQlClass() { result = "RegExpCaret" }

View File

@@ -0,0 +1,4 @@
---
category: newQuery
---
* Added a new query, `rb/regex/missing-regexp-anchor`, which finds regular expressions which are improperly anchored. Validations using such expressions are at risk of being bypassed.

View File

@@ -0,0 +1,90 @@
<!DOCTYPE qhelp PUBLIC
"-//Semmle//qhelp//EN"
"qhelp.dtd">
<qhelp>
<overview>
<p>
Sanitizing untrusted input with regular expressions is a
common technique. However, it is error-prone to match untrusted input
against regular expressions without anchors such as <code>\A</code> or
<code>\z</code>. Malicious input can bypass such security checks by
embedding one of the allowed patterns in an unexpected location.
</p>
<p>
Even if the matching is not done in a security-critical
context, it may still cause undesirable behavior when the regular
expression accidentally matches.
</p>
</overview>
<recommendation>
<p>
Use anchors to ensure that regular expressions match at
the expected locations.
</p>
</recommendation>
<example>
<p>
The following example code checks that a URL redirection
will reach the <code>example.com</code> domain, or one of its
subdomains, and not some malicious site.
</p>
<sample src="examples/missing_regexp_anchor_bad.rb"/>
<p>
The check with the regular expression match is, however, easy to bypass. For example
by embedding <code>http://example.com/</code> in the query
string component: <code>http://evil-example.net/?x=http://example.com/</code>.
Address these shortcomings by using anchors in the regular expression instead:
</p>
<sample src="examples/missing_regexp_anchor_good.rb"/>
<p>
A related mistake is to write a regular expression with
multiple alternatives, but to only include an anchor for one of the
alternatives. As an example, the regular expression
<code>/^www\.example\.com|beta\.example\.com/</code> will match the host
<code>evil.beta.example.com</code> because the regular expression is parsed
as <code>/(^www\.example\.com)|(beta\.example\.com)/</code>
</p>
<p>
In Ruby the anchors <code>^</code> and <code>$</code> match the
start and end of a line, whereas the anchors <code>\A</code> and
<code>\z</code> match the start and end of the entire string.
Using line anchors can be dangerous, as this can allow malicious
input to be hidden using newlines, leading to vulnerabilities such
as HTTP header injection.
Unless you specifically need the line-matching behaviour of
<code>^</code> and <code>$</code>, you should use <code>\A</code>
and <code>\z</code> instead.
</p>
</example>
<references>
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">XSS Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
</references>
</qhelp>

View File

@@ -0,0 +1,253 @@
/**
* @name Missing regular expression anchor
* @description Regular expressions without anchors can be vulnerable to bypassing.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision medium
* @id rb/regex/missing-regexp-anchor
* @tags correctness
* security
* external/cwe/cwe-020
*/
import HostnameRegexpShared
import codeql.ruby.DataFlow
import codeql.ruby.regexp.RegExpTreeView
import codeql.ruby.Regexp
/**
* Holds if `term` is a final term, that is, no term will match anything after this one.
*/
predicate isFinalRegExpTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpSequence seq |
isFinalRegExpTerm(seq) and
term = seq.getLastChild()
)
or
exists(RegExpTerm parent |
isFinalRegExpTerm(parent) and
term = parent.getAChild() and
not parent instanceof RegExpSequence and
not parent instanceof RegExpQuantifier
)
}
/** Holds if `term` is one of the transitive left children of a regexp. */
predicate isLeftArmTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpTerm parent |
term = parent.getChild(0) and
isLeftArmTerm(parent)
)
}
/** Holds if `term` is one of the transitive right children of a regexp. */
predicate isRightArmTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpTerm parent |
term = parent.getLastChild() and
isRightArmTerm(parent)
)
}
/**
* Holds if `term` is an anchor that is not the first or last node
* in its tree.
*/
predicate isInteriorAnchor(RegExpAnchor term) {
not isLeftArmTerm(term) and
not isRightArmTerm(term)
}
/**
* Holds if `term` contains an anchor that is not the first or last node
* in its tree, such as `(foo|bar$|baz)`.
*/
predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
/**
* Holds if `term` starts with a word boundary or lookbehind assertion,
* indicating that it's not intended to be anchored on that side.
*/
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
exists(RegExpTerm child | child = term.getChild(0) |
child instanceof RegExpWordBoundary or
child instanceof RegExpNonWordBoundary or
child instanceof RegExpLookbehind
)
}
/**
* Holds if `term` ends with a word boundary or lookahead assertion,
* indicating that it's not intended to be anchored on that side.
*/
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
exists(RegExpTerm child | child = term.getLastChild() |
child instanceof RegExpWordBoundary or
child instanceof RegExpNonWordBoundary or
child instanceof RegExpLookahead
)
}
/**
* Holds if `term` is an empty sequence, usually arising from
* literals with a trailing alternative such as `foo|`.
*/
predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
/**
* Holds if `term` contains a letter constant.
*
* We use this as a heuristic to filter out uninteresting results.
*/
predicate containsLetters(RegExpTerm term) {
term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
}
/**
* Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
* and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
*/
predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
isAnchoredGroup(alt.getAChild()) and
not alt.getAChild() instanceof RegExpGroup
}
/**
* Holds if `term` consists only of an anchor and a parenthesized term,
* such as the left side of `^(foo|bar)|baz`.
*
* The precedence of the anchor is likely to be intentional in this case,
* as the group wouldn't be needed otherwise.
*/
predicate isAnchoredGroup(RegExpSequence term) {
term.getNumChild() = 2 and
term.getAChild() instanceof RegExpAnchor and
term.getAChild() instanceof RegExpGroup
}
/**
* Holds if `src` is a pattern for a collection of alternatives where
* only the first or last alternative is anchored, indicating a
* precedence mistake explained by `msg`.
*
* The canonical example of such a mistake is: `^a|b|c`, which is
* parsed as `(^a)|(b)|(c)`.
*/
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
root = src.getRegExpTerm() and
not containsInteriorAnchor(root) and
not isEmpty(root.getAChild()) and
not hasExplicitAnchorPrecedence(root) and
containsLetters(anchoredTerm) and
(
anchoredTerm = root.getChild(0) and
anchoredTerm.getChild(0) instanceof RegExpCaret and
not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
direction = "beginning"
or
anchoredTerm = root.getLastChild() and
anchoredTerm.getLastChild() instanceof RegExpDollar and
not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
direction = "end"
) and
// that is not used for string replacement
not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
name = mcn.getMethodName() and
arg = mcn.getArgument(0)
|
(
src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
src.getAParse() = arg
) and
name = ["sub", "sub!", "gsub", "gsub!"]
) and
msg =
"Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
"' is anchored at the " + direction +
", but the other parts of this regular expression are not"
)
}
/**
* Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
* rather than `\A/\z` which match the start/end of the whole string.
*/
predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
// avoid double reporting
not (
isSemiAnchoredHostnameRegExp(src, _) or
hasMisleadingAnchorPrecedence(src, _)
) and
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
not isConstantInvalidInsideOrigin(term.getAChild*()) and
tld = term.getAChild*() and
hasTopLevelDomainEnding(tld, i) and
isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
(
tld.getChild(0).(RegExpCaret).getChar() = "^" or
tld.getLastChild().(RegExpDollar).getChar() = "$"
) and
msg =
"This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead."
)
}
/**
* Holds if `src` contains a hostname pattern that is missing a `$` anchor.
*/
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
not isConstantInvalidInsideOrigin(term.getAChild*()) and
tld = term.getAChild*() and
hasTopLevelDomainEnding(tld, i) and
isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
tld.getChild(0) instanceof RegExpCaret and
msg =
"This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end."
)
}
/**
* Holds if `src` is an unanchored pattern for a URL, indicating a
* mistake explained by `msg`.
*/
predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
alwaysMatchesHostname(term) and
tld = term.getAChild*() and
hasTopLevelDomainEnding(tld) and
not isConstantInvalidInsideOrigin(term.getAChild*()) and
not term.getAChild*() instanceof RegExpAnchor and
// that is not used for string replacement
not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
name = mcn.getMethodName() and
arg = mcn.getArgument(0)
|
(
src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
src.getAParse() = arg
) and
name = ["sub", "sub!", "gsub", "gsub!"]
) and
msg =
"When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
)
}
from DataFlow::Node nd, string msg
where
isUnanchoredHostnameRegExp(nd, msg) or
isSemiAnchoredHostnameRegExp(nd, msg) or
isLineAnchoredHostnameRegExp(nd, msg) or
hasMisleadingAnchorPrecedence(nd, msg)
select nd, msg

View File

@@ -0,0 +1,8 @@
class UsersController < ActionController::Base
def index
# BAD: the host of `params[:url]` may be controlled by an attacker
if params[:url].match? /https?:\/\/www\.example\.com\//
redirect_to params[:url]
end
end
end

View File

@@ -0,0 +1,8 @@
class UsersController < ActionController::Base
def index
# GOOD: the host of `params[:url]` can not be controlled by an attacker
if params[:url].match? /\Ahttps?:\/\/www\.example\.com\//
redirect_to params[:url]
end
end
end

View File

@@ -266,11 +266,11 @@ regexp.rb:
# 43| [RegExpSequence] \b!a\B
#-----| 0 -> [RegExpSpecialChar] \b
#-----| 1 -> [RegExpConstant, RegExpNormalChar] !a
#-----| 2 -> [RegExpSpecialChar] \B
#-----| 2 -> [RegExpNonWordBoundary] \B
# 43| [RegExpConstant, RegExpNormalChar] !a
# 43| [RegExpSpecialChar] \B
# 43| [RegExpNonWordBoundary] \B
# 46| [RegExpGroup] (foo)
#-----| 0 -> [RegExpConstant, RegExpNormalChar] foo

View File

@@ -108,7 +108,7 @@ term
| regexp.rb:43:2:43:3 | \\b | RegExpSpecialChar |
| regexp.rb:43:2:43:7 | \\b!a\\B | RegExpSequence |
| regexp.rb:43:4:43:5 | !a | RegExpConstant,RegExpNormalChar |
| regexp.rb:43:6:43:7 | \\B | RegExpSpecialChar |
| regexp.rb:43:6:43:7 | \\B | RegExpNonWordBoundary |
| regexp.rb:46:2:46:6 | (foo) | RegExpGroup |
| regexp.rb:46:2:46:7 | (foo)* | RegExpStar |
| regexp.rb:46:2:46:10 | (foo)*bar | RegExpSequence |

View File

@@ -0,0 +1,18 @@
| missing_regexp_anchor.rb:1:1:1:19 | /www\\.example\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
| missing_regexp_anchor.rb:7:1:7:22 | /https?:\\/\\/good\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
| missing_regexp_anchor.rb:8:1:8:23 | /^https?:\\/\\/good\\.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
| missing_regexp_anchor.rb:19:1:19:6 | /^a\|b/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:22:1:22:8 | /^a\|b\|c/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:28:1:28:8 | /^a\|(b)/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:30:1:30:10 | /^(a)\|(b)/ | Misleading operator precedence. The subexpression '^(a)' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:33:1:33:6 | /a\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:36:1:36:8 | /a\|b\|c$/ | Misleading operator precedence. The subexpression 'c$' is anchored at the end, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:42:1:42:8 | /(a)\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:44:1:44:10 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored at the end, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:46:1:46:22 | /^good.com\|better.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:47:1:47:24 | /^good\\.com\|better\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:48:1:48:26 | /^good\\\\.com\|better\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:49:1:49:28 | /^good\\\\\\.com\|better\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |

View File

@@ -0,0 +1 @@
queries/security/cwe-020/MissingRegExpAnchor.ql

View File

@@ -0,0 +1,53 @@
/www\.example\.com/ # BAD
/^www\.example\.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
/\Awww\.example\.com\z/ # GOOD
/foo\.bar/ # GOOD
/https?:\/\/good\.com/ # BAD
/^https?:\/\/good\.com/ # BAD: missing end-of-string anchor
/(^https?:\/\/good1\.com)|(^https?:#good2\.com)/ # BAD: missing end-of-string anchor
/bar/ # GOOD
foo.gsub(/www\.example\.com/, "bar") # GOOD
foo.sub(/www\.example.com/, "bar") # GOOD
foo.gsub!(/www\.example\.com/, "bar") # GOOD
foo.sub!(/www\.example\.com/, "bar") # GOOD
/^a|/
/^a|b/ # BAD
/a|^b/
/^a|^b/
/^a|b|c/ # BAD
/a|^b|c/
/a|b|^c/
/^a|^b|c/
/(^a)|b/
/^a|(b)/ # BAD
/^a|(^b)/
/^(a)|(b)/ # BAD
/a|b$/ # BAD
/a$|b/
/a$|b$/
/a|b|c$/ # BAD
/a|b$|c/
/a$|b|c/
/a|b$|c$/
/a|(b$)/
/(a)|b$/ # BAD
/(a$)|b$/
/(a)|(b)$/ # BAD
/^good.com|better.com/ # BAD
/^good\.com|better\.com/ # BAD
/^good\\.com|better\\.com/ # BAD
/^good\\\.com|better\\\.com/ # BAD
/^good\\\\.com|better\\\\.com/ # BAD
/^foo|bar|baz$/ # BAD
/^foo|%/ # OK