diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
index cb2cddb6573..eac01730532 100644
--- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
+++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
@@ -65,8 +65,6 @@
evil.beta.example.com because the regular expression is parsed
as /(^www\.example\.com)|(beta\.example\.com)/
- TODO: implement this part of the query
-
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql index 3c67b83ac61..d06ef7efa9b 100644 --- a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql +++ b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql @@ -34,12 +34,158 @@ predicate isFinalRegExpTerm(RegExpTerm term) { ) } +/** Holds if `term` is one of the transitive left children of a regexp. */ +predicate isLeftArmTerm(RegExpTerm term) { + term.isRootTerm() + or + exists(RegExpTerm parent | + term = parent.getChild(0) and + isLeftArmTerm(parent) + ) +} + +/** Holds if `term` is one of the transitive right children of a regexp. */ +predicate isRightArmTerm(RegExpTerm term) { + term.isRootTerm() + or + exists(RegExpTerm parent | + term = parent.getLastChild() and + isRightArmTerm(parent) + ) +} + +/** + * Holds if `term` is an anchor that is not the first or last node + * in its tree. + */ +predicate isInteriorAnchor(RegExpAnchor term) { + not isLeftArmTerm(term) and + not isRightArmTerm(term) +} + +/** + * Holds if `term` contains an anchor that is not the first or last node + * in its tree, such as `(foo|bar$|baz)`. + */ +predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) } + +/** + * Holds if `term` starts with a word boundary or lookbehind assertion, + * indicating that it's not intended to be anchored on that side. + */ +predicate containsLeadingPseudoAnchor(RegExpSequence term) { + exists(RegExpTerm child | child = term.getChild(0) | + child instanceof RegExpWordBoundary or + child instanceof RegExpNonWordBoundary or + child instanceof RegExpLookbehind + ) +} + +/** + * Holds if `term` ends with a word boundary or lookahead assertion, + * indicating that it's not intended to be anchored on that side. + */ +predicate containsTrailingPseudoAnchor(RegExpSequence term) { + exists(RegExpTerm child | child = term.getLastChild() | + child instanceof RegExpWordBoundary or + child instanceof RegExpNonWordBoundary or + child instanceof RegExpLookahead + ) +} + +/** + * Holds if `term` is an empty sequence, usually arising from + * literals with a trailing alternative such as `foo|`. + */ +predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 } + +/** + * Holds if `term` contains a letter constant. + * + * We use this as a heuristic to filter out uninteresting results. + */ +predicate containsLetters(RegExpTerm term) { + term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*") +} + +/** + * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz` + * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`. + */ +predicate hasExplicitAnchorPrecedence(RegExpAlt alt) { + isAnchoredGroup(alt.getAChild()) and + not alt.getAChild() instanceof RegExpGroup +} + +/** + * Holds if `term` consists only of an anchor and a parenthesized term, + * such as the left side of `^(foo|bar)|baz`. + * + * The precedence of the anchor is likely to be intentional in this case, + * as the group wouldn't be needed otherwise. + */ +predicate isAnchoredGroup(RegExpSequence term) { + term.getNumChild() = 2 and + term.getAChild() instanceof RegExpAnchor and + term.getAChild() instanceof RegExpGroup +} + +/** + * Holds if `src` is a pattern for a collection of alternatives where + * only the first or last alternative is anchored, indicating a + * precedence mistake explained by `msg`. + * + * The canonical example of such a mistake is: `^a|b|c`, which is + * parsed as `(^a)|(b)|(c)`. + */ +predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) { + exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction | + root = src.getRegExpTerm() and + not containsInteriorAnchor(root) and + not isEmpty(root.getAChild()) and + not hasExplicitAnchorPrecedence(root) and + containsLetters(anchoredTerm) and + ( + anchoredTerm = root.getChild(0) and + anchoredTerm.getChild(0) instanceof RegExpCaret and + not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and + containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and + direction = "beginning" + or + anchoredTerm = root.getLastChild() and + anchoredTerm.getLastChild() instanceof RegExpDollar and + not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and + containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and + direction = "end" + ) and + // that is not used for string replacement + not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name | + name = mcn.getMethodName() and + arg = mcn.getArgument(0) + | + ( + src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or + src.getAParse() = arg + ) and + name = ["sub", "sub!", "gsub", "gsub!"] + ) and + msg = + "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() + + "' is anchored at the " + direction + + ", but the other parts of this regular expression are not" + ) +} + /** * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors * rather than `\A/\z` which match the start/end of the whole string. */ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { - not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting + // avoid double reporting + not ( + isSemiAnchoredHostnameRegExp(src, _) or + hasMisleadingAnchorPrecedence(src, _) + ) and exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | not isConstantInvalidInsideOrigin(term.getAChild*()) and tld = term.getAChild*() and @@ -58,7 +204,7 @@ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { * Holds if `src` contains a hostname pattern that is missing a `$` anchor. */ predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { - // not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting + not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | not isConstantInvalidInsideOrigin(term.getAChild*()) and tld = term.getAChild*() and @@ -81,7 +227,7 @@ predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) { hasTopLevelDomainEnding(tld) and not isConstantInvalidInsideOrigin(term.getAChild*()) and not term.getAChild*() instanceof RegExpAnchor and - // that is not used for capture or replace + // that is not used for string replacement not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name | name = mcn.getMethodName() and arg = mcn.getArgument(0) @@ -101,5 +247,6 @@ from DataFlow::Node nd, string msg where isUnanchoredHostnameRegExp(nd, msg) or isSemiAnchoredHostnameRegExp(nd, msg) or - isLineAnchoredHostnameRegExp(nd, msg) + isLineAnchoredHostnameRegExp(nd, msg) or + hasMisleadingAnchorPrecedence(nd, msg) select nd, msg diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected index c4825104b7e..9f5a8b11fc6 100644 --- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected @@ -1,4 +1,18 @@ -| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | -| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | -| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. | -| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. | +| missing_regexp_anchor.rb:1:1:1:19 | /www\\.example\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | +| missing_regexp_anchor.rb:7:1:7:22 | /https?:\\/\\/good\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. | +| missing_regexp_anchor.rb:8:1:8:23 | /^https?:\\/\\/good\\.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. | +| missing_regexp_anchor.rb:19:1:19:6 | /^a\|b/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:22:1:22:8 | /^a\|b\|c/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:28:1:28:8 | /^a\|(b)/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:30:1:30:10 | /^(a)\|(b)/ | Misleading operator precedence. The subexpression '^(a)' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:33:1:33:6 | /a\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:36:1:36:8 | /a\|b\|c$/ | Misleading operator precedence. The subexpression 'c$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:42:1:42:8 | /(a)\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:44:1:44:10 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored at the end, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:46:1:46:22 | /^good.com\|better.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:47:1:47:24 | /^good\\.com\|better\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:48:1:48:26 | /^good\\\\.com\|better\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:49:1:49:28 | /^good\\\\\\.com\|better\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not | +| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not | diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb index e7c9ae6a661..646a3574447 100644 --- a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb +++ b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb @@ -1,18 +1,53 @@ -/www.example.com/ # BAD -/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors -/\Awww.example.com\z/ # GOOD +/www\.example\.com/ # BAD +/^www\.example\.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors +/\Awww\.example\.com\z/ # GOOD -/foo.bar/ # GOOD +/foo\.bar/ # GOOD -/https?:\/\/good.com/ # BAD -/^https?:\/\/good.com/ # BAD: missing end-of-string anchor -/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor +/https?:\/\/good\.com/ # BAD +/^https?:\/\/good\.com/ # BAD: missing end-of-string anchor +/(^https?:\/\/good1\.com)|(^https?:#good2\.com)/ # BAD: missing end-of-string anchor /bar/ # GOOD -foo.gsub(/www.example.com/, "bar") # GOOD -foo.sub(/www.example.com/, "bar") # GOOD -foo.gsub!(/www.example.com/, "bar") # GOOD -foo.sub!(/www.example.com/, "bar") # GOOD +foo.gsub(/www\.example\.com/, "bar") # GOOD +foo.sub(/www\.example.com/, "bar") # GOOD +foo.gsub!(/www\.example\.com/, "bar") # GOOD +foo.sub!(/www\.example\.com/, "bar") # GOOD + +/^a|/ +/^a|b/ # BAD +/a|^b/ +/^a|^b/ +/^a|b|c/ # BAD +/a|^b|c/ +/a|b|^c/ +/^a|^b|c/ + +/(^a)|b/ +/^a|(b)/ # BAD +/^a|(^b)/ +/^(a)|(b)/ # BAD +/a|b$/ # BAD +/a$|b/ +/a$|b$/ +/a|b|c$/ # BAD +/a|b$|c/ +/a$|b|c/ +/a|b$|c$/ + +/a|(b$)/ +/(a)|b$/ # BAD +/(a$)|b$/ +/(a)|(b)$/ # BAD + +/^good.com|better.com/ # BAD +/^good\.com|better\.com/ # BAD +/^good\\.com|better\\.com/ # BAD +/^good\\\.com|better\\\.com/ # BAD +/^good\\\\.com|better\\\\.com/ # BAD + +/^foo|bar|baz$/ # BAD +/^foo|%/ # OK \ No newline at end of file