share {js,rb}/regex/missing-regexp-anchor

This commit is contained in:
erik-krogh
2022-12-15 21:52:02 +01:00
parent 355499ea52
commit 26c5480ee6
10 changed files with 346 additions and 384 deletions

View File

@@ -366,6 +366,9 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
/** Holds unconditionally: every anchor term is reported as nullable. */
override predicate isNullable() { any() }
/** Gets the name of the primary QL class of this term, `"RegExpAnchor"`. */
override string getAPrimaryQlClass() { result = "RegExpAnchor" }
/**
 * Gets the character this anchor is written as.
 *
 * Concrete anchor classes supply the value, for example `^` for a caret
 * and `$` for a dollar.
 */
abstract string getChar();
}
/**
@@ -379,6 +382,8 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
*/
class RegExpCaret extends RegExpAnchor, @regexp_caret {
  /** Gets the character this anchor is written as, which is `^`. */
  override string getChar() { result = "^" }

  /** Gets the name of the primary QL class of this term, `"RegExpCaret"`. */
  override string getAPrimaryQlClass() { result = "RegExpCaret" }
}
/**
@@ -392,6 +397,8 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
*/
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
  /** Gets the character this anchor is written as, which is `$`. */
  override string getChar() { result = "$" }

  /** Gets the name of the primary QL class of this term, `"RegExpDollar"`. */
  override string getAPrimaryQlClass() { result = "RegExpDollar" }
}
/**

View File

@@ -8,7 +8,8 @@ private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView
private import semmle.javascript.Regexp as RegExp
private import codeql.regex.HostnameRegexp as Shared
private module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
/** An implementation of the signature that allows the Hostname analysis to run. */
module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
class DataFlowNode = JS::DataFlow::Node;
class RegExpPatternSource = RegExp::RegExpPatternSource;

View File

@@ -12,194 +12,38 @@
*/
private import javascript
private import semmle.javascript.security.regexp.HostnameRegexp
private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp
private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor
private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl
// TODO: Share the below code.
/**
 * Holds if the anchor `term` sits in the interior of its tree, that is,
 * it is neither a left-arm term nor a right-arm term (not the first and
 * not the last node).
 */
predicate isInteriorAnchor(RegExpAnchor term) {
  not (isLeftArmTerm(term) or isRightArmTerm(term))
}
/**
 * Holds if `term` itself, or some term nested anywhere below it, is an
 * interior anchor. An example is `(foo|bar$|baz)`, where the `$` is neither
 * the first nor the last node of the tree.
 */
predicate containsInteriorAnchor(RegExpTerm term) {
  exists(RegExpAnchor anchor | anchor = term.getAChild*() | isInteriorAnchor(anchor))
}
/**
 * Holds if the first child of the sequence `term` is a word boundary,
 * a non-word boundary, or a lookbehind assertion.
 *
 * Such a start indicates that the sequence is not meant to be anchored
 * on its left side.
 */
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
  term.getChild(0) instanceof RegExpWordBoundary
  or
  term.getChild(0) instanceof RegExpNonWordBoundary
  or
  term.getChild(0) instanceof RegExpLookbehind
}
/**
 * Holds if the last child of the sequence `term` is a word boundary,
 * a non-word boundary, or a lookahead assertion.
 *
 * Such an ending indicates that the sequence is not meant to be anchored
 * on its right side.
 */
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
  term.getLastChild() instanceof RegExpWordBoundary
  or
  term.getLastChild() instanceof RegExpNonWordBoundary
  or
  term.getLastChild() instanceof RegExpLookahead
}
/**
 * Holds if the sequence `term` has no children at all.
 *
 * Empty sequences usually come from a literal with a trailing
 * alternative, such as `foo|`.
 */
predicate isEmpty(RegExpSequence term) {
  0 = term.getNumChild()
}
/**
 * Holds if `term`, or a term nested below it, is a constant whose value
 * contains at least one ASCII letter.
 *
 * This is a heuristic used to filter out uninteresting results.
 */
predicate containsLetters(RegExpTerm term) {
  exists(RegExpConstant constant |
    constant = term.getAChild*() and
    constant.getValue().regexpMatch(".*[a-zA-Z].*")
  )
}
/**
 * Holds if the sequence `term` has exactly two children, one of which is
 * an anchor and one of which is a group, such as the left side of
 * `^(foo|bar)|baz`.
 *
 * In that shape the precedence of the anchor is likely intentional,
 * because the group would not be needed otherwise.
 */
predicate isAnchoredGroup(RegExpSequence term) {
  exists(RegExpAnchor anchor | anchor = term.getAChild()) and
  exists(RegExpGroup grp | grp = term.getAChild()) and
  term.getNumChild() = 2
}
/**
 * Holds if some alternative of `alt` is an explicitly anchored group, as in
 * `^(foo|bar)|baz`, and no alternative of `alt` is itself a group, which
 * rules out patterns with unnecessary groups such as `^(foo)|(bar)`.
 */
predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
  exists(RegExpSequence anchored | anchored = alt.getAChild() | isAnchoredGroup(anchored)) and
  not exists(RegExpGroup grp | grp = alt.getAChild())
}
/**
 * Holds if the root of the pattern `src` is a list of alternatives of which
 * only the first is anchored with `^`, or only the last is anchored with
 * `$`. This indicates an operator-precedence mistake, described by `msg`.
 *
 * The canonical example is `^a|b|c`, which parses as `(^a)|(b)|(c)`.
 *
 * Patterns are excluded when they contain an interior anchor, have an empty
 * alternative, use an explicitly anchored group, or flow to the first
 * argument of a `replace` call.
 */
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
  exists(RegExpAlt alt, RegExpSequence anchored, string side |
    alt = src.getRegExpTerm() and
    containsLetters(anchored) and
    (
      // only the first alternative starts with `^`
      side = "beginning" and
      anchored = alt.getChild(0) and
      anchored.getChild(0) instanceof RegExpCaret and
      containsLetters(alt.getChild([1 .. alt.getNumChild() - 1])) and
      not containsLeadingPseudoAnchor(alt.getChild([1 .. alt.getNumChild() - 1]))
      or
      // only the last alternative ends with `$`
      side = "end" and
      anchored = alt.getLastChild() and
      anchored.getLastChild() instanceof RegExpDollar and
      containsLetters(alt.getChild([0 .. alt.getNumChild() - 2])) and
      not containsTrailingPseudoAnchor(alt.getChild([0 .. alt.getNumChild() - 2]))
    ) and
    // none of the structural exclusions apply
    not (
      containsInteriorAnchor(alt)
      or
      isEmpty(alt.getAChild())
      or
      hasExplicitAnchorPrecedence(alt)
    ) and
    // is not used for replace
    not exists(DataFlow::MethodCallNode call |
      src.getARegExpObject().flowsTo(call.getArgument(0)) and
      call.getMethodName() = "replace"
    ) and
    msg =
      "Misleading operator precedence. The subexpression '" + anchored.getRawValue() +
        "' is anchored at the " + side +
        ", but the other parts of this regular expression are not"
  )
}
/**
 * Holds if `term` is a final term, meaning that no term will match
 * anything after it.
 *
 * The root term is final. Below a final sequence only its last child is
 * final. Below any other final parent that is not a quantifier, every
 * child is final.
 */
predicate isFinalRegExpTerm(RegExpTerm term) {
  term.isRootTerm()
  or
  exists(RegExpTerm parent | isFinalRegExpTerm(parent) |
    // in a sequence, only the last element ends the match
    term = parent.(RegExpSequence).getLastChild()
    or
    // any child of a final parent that is neither a sequence nor a quantifier
    term = parent.getAChild() and
    not (parent instanceof RegExpSequence or parent instanceof RegExpQuantifier)
  )
}
/**
 * Holds if `src` contains a hostname pattern that is anchored with `^` at
 * the start but is missing a `$` anchor, as explained by `msg`.
 *
 * Patterns already reported by `hasMisleadingAnchorPrecedence` are skipped.
 */
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  exists(RegExpTerm root, RegExpSequence seq, int idx |
    root = src.getRegExpTerm() and
    seq = root.getAChild*() and
    seq.getChild(0) instanceof RegExpCaret and
    hasTopLevelDomainEnding(seq, idx) and
    // nothing is matched after the TLD
    isFinalRegExpTerm(seq.getChild(idx)) and
    not isConstantInvalidInsideOrigin(root.getAChild*())
  ) and
  // avoid double reporting
  not hasMisleadingAnchorPrecedence(src, _) and
  msg =
    "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end."
}
/**
* Holds if `src` is an unanchored pattern for a URL, indicating a
* mistake explained by `msg`.
*/
predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
alwaysMatchesHostname(term) and
tld = term.getAChild*() and
hasTopLevelDomainEnding(tld) and
not isConstantInvalidInsideOrigin(term.getAChild*()) and
not term.getAChild*() instanceof RegExpAnchor and
// that is not used for capture or replace
not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
/**
 * Implements `MissingRegExpAnchor::MissingRegExpAnchorSig` for `TreeImpl`
 * and `HostnameRegexp::Impl`.
 */
// NOTE(review): this span comes from a rendered diff. The lines that refer to
// `src` and `msg` appear to be the removed counterparts of the adjacent
// `pattern` lines - confirm against the checked-in file.
private module Impl implements
MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> {
/**
 * Holds if `pattern` is used for capture or replace: an `exec` call on its
 * regular expression object has a property read on the result, or the
 * pattern reaches the first argument of a `replace` call, or of a `match`
 * call that has a property read on the result.
 */
predicate isUsedAsReplace(RegExpPatternSource pattern) {
// is used for capture or replace
exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
name = "exec" and
mcn = src.getARegExpObject().getAMethodCall() and
mcn = pattern.getARegExpObject().getAMethodCall() and
exists(mcn.getAPropertyRead())
or
exists(DataFlow::Node arg |
arg = mcn.getArgument(0) and
(
src.getARegExpObject().flowsTo(arg) or
src.getAParse() = arg
pattern.getARegExpObject().flowsTo(arg) or
pattern.getAParse() = arg
)
|
name = "replace"
or
name = "match" and exists(mcn.getAPropertyRead())
)
) and
msg =
"When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
)
)
}
/** Gets the text of the end anchor, which is `$` here. */
string getEndAnchorText() { result = "$" }
}
import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl>
from DataFlow::Node nd, string msg
where
isUnanchoredHostnameRegExp(nd, msg)
@@ -207,4 +51,5 @@ where
isSemiAnchoredHostnameRegExp(nd, msg)
or
hasMisleadingAnchorPrecedence(nd, msg)
// isLineAnchoredHostnameRegExp is not used here, as it is not relevant to JS.
select nd, msg