JS: Add missing post-anchor case to MissingRegExpAnchor

This commit is contained in:
Asger F
2019-10-24 16:04:52 +01:00
parent 17ad97812e
commit 8c5b9b9195
6 changed files with 101 additions and 7 deletions

View File

@@ -67,3 +67,38 @@ predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
predicate hasTopLevelDomainEnding(RegExpSequence seq) {
hasTopLevelDomainEnding(seq, _)
}
/**
* Holds if `term` will always match a hostname, that is, all disjunctions contain
* a hostname pattern that isn't inside a quantifier.
*/
predicate alwaysMatchesHostname(RegExpTerm term) {
hasTopLevelDomainEnding(term, _)
or
// `localhost` is considered a hostname pattern, but has no TLD
term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
or
not term instanceof RegExpAlt and
not term instanceof RegExpQuantifier and
alwaysMatchesHostname(term.getAChild())
or
alwaysMatchesHostnameAlt(term)
}
/** Holds if every child of `alt` contains a hostname pattern. */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
}
/**
* Holds if the first `i` children of `alt` contains a hostname pattern.
*
* This is used instead of `forall` to avoid materializing the set of alternatives
* that don't contains hostnames, which is much larger.
*/
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
alwaysMatchesHostname(alt.getChild(0)) and i = 0
or
alwaysMatchesHostnameAlt(alt, i - 1) and
alwaysMatchesHostname(alt.getChild(i))
}

View File

@@ -121,7 +121,7 @@ predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
* The canonical example of such a mistake is: `^a|b|c`, which is
* parsed as `(^a)|(b)|(c)`.
*/
predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
root = src.getRegExpTerm() and
not containsInteriorAnchor(root) and
@@ -151,12 +151,47 @@ predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string
)
}
/**
* Holds if `term` is a final term, that is, no term will match anything after this one.
*/
predicate isFinalRegExpTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpSequence seq |
isFinalRegExpTerm(seq) and
term = seq.getLastChild()
)
or
exists(RegExpTerm parent |
isFinalRegExpTerm(parent) and
term = parent.getAChild() and
not parent instanceof RegExpSequence and
not parent instanceof RegExpQuantifier
)
}
/**
* Holds if `src` contains a hostname pattern that is missing a `$` anchor.
*/
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
not isConstantInvalidInsideOrigin(term.getAChild*()) and
tld = term.getAChild*() and
hasTopLevelDomainEnding(tld, i) and
isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
tld.getChild(0) instanceof RegExpCaret and
msg = "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end."
)
}
/**
* Holds if `src` is an unanchored pattern for a URL, indicating a
* mistake explained by `msg`.
*/
predicate isInterestingUnanchoredRegExpString(RegExpPatternSource src, string msg) {
predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
alwaysMatchesHostname(term) and
tld = term.getAChild*() and
hasTopLevelDomainEnding(tld) and
not isConstantInvalidInsideOrigin(term.getAChild*()) and
@@ -185,7 +220,9 @@ predicate isInterestingUnanchoredRegExpString(RegExpPatternSource src, string ms
from DataFlow::Node nd, string msg
where
isInterestingUnanchoredRegExpString(nd, msg)
isUnanchoredHostnameRegExp(nd, msg)
or
isInterestingSemiAnchoredRegExpString(nd, msg)
isSemiAnchoredHostnameRegExp(nd, msg)
or
hasMisleadingAnchorPrecedence(nd, msg)
select nd, msg