JS: Update semi-anchored regex query

This commit is contained in:
Asger F
2019-10-16 22:15:57 +01:00
parent c21d095d38
commit 8bc89ee254
5 changed files with 138 additions and 37 deletions

View File

@@ -12,6 +12,84 @@
import javascript
/** Holds if `term` is one of the transitive left children of a regexp. */
predicate isLeftArmTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpTerm parent |
term = parent.getChild(0) and
isLeftArmTerm(parent)
)
}
/** Holds if `term` is one of the transitive right children of a regexp. */
predicate isRightArmTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpTerm parent |
term = parent.getLastChild() and
isRightArmTerm(parent)
)
}
/**
* Holds if `term` is an anchor that is not the first or last node
* in its tree.
*/
predicate isInteriorAnchor(RegExpAnchor term) {
not isLeftArmTerm(term) and
not isRightArmTerm(term)
}
/**
* Holds if `term` contains an anchor that is not the first or last node
* in its tree, such as `(foo|bar$|baz)`.
*/
predicate containsInteriorAnchor(RegExpTerm term) {
isInteriorAnchor(term.getAChild*())
}
/**
* Holds if `term` starts with a word boundary or lookbehind assertion,
* indicating that it's not intended to be anchored on that side.
*/
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
exists(RegExpTerm child | child = term.getChild(0) |
child instanceof RegExpWordBoundary or
child instanceof RegExpNonWordBoundary or
child instanceof RegExpLookbehind
)
}
/**
* Holds if `term` ends with a word boundary or lookahead assertion,
* indicating that it's not intended to be anchored on that side.
*/
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
exists(RegExpTerm child | child = term.getLastChild() |
child instanceof RegExpWordBoundary or
child instanceof RegExpNonWordBoundary or
child instanceof RegExpLookahead
)
}
/**
* Holds if `term` is an empty sequence, usually arising from
* literals with a trailing alternative such as `foo|`.
*/
predicate isEmpty(RegExpSequence term) {
term.getNumChild() = 0
}
/**
* Holds if `term` contains a letter constant.
*
* We use this as a heuristic to filter out uninteresting results.
*/
predicate containsLetters(RegExpTerm term) {
term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
}
/**
* Holds if `src` is a pattern for a collection of alternatives where
* only the first or last alternative is anchored, indicating a
@@ -21,23 +99,21 @@ import javascript
* parsed as `(^a)|(b)|(c)`.
*/
predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
exists(string str, string maybeGroupedStr, string regex, string anchorPart, string escapedDot |
// a dot that might be escaped in a regular expression, for example `/\./` or `new RegExp('\\.')`
escapedDot = "\\\\[.]" and
// a string that is mostly free from special reqular expression symbols
str = "(?:(?:" + escapedDot + ")|[a-z:/.?_,@0-9 -])+" and
// the string may be wrapped in parentheses
maybeGroupedStr = "(?:" + str + "|\\(" + str + "\\))" and
exists(RegExpAlt root, RegExpSequence anchoredTerm |
root = src.getRegExpTerm() and
not containsInteriorAnchor(root) and
not isEmpty(root.getAChild()) and
containsLetters(anchoredTerm) and
(
// a problematic pattern: `^a|b|...|x`
regex = "(?i)(\\^" + maybeGroupedStr + ")(?:\\|" + maybeGroupedStr + ")+"
anchoredTerm = root.getChild(0) and
anchoredTerm.getChild(0) instanceof RegExpCaret and
not containsLeadingPseudoAnchor(root.getChild([ 1 .. root.getNumChild() - 1 ]))
or
// a problematic pattern: `a|b|...|x$`
regex = "(?i)(?:" + maybeGroupedStr + "\\|)+(" + maybeGroupedStr + "\\$)"
anchoredTerm = root.getLastChild() and
anchoredTerm.getLastChild() instanceof RegExpDollar and
not containsTrailingPseudoAnchor(root.getChild([ 0 .. root.getNumChild() - 2 ]))
) and
anchorPart = src.getPattern().regexpCapture(regex, 1) and
anchorPart.regexpMatch("(?i).*[a-z].*") and
msg = "Misleading operator precedence. The subexpression '" + anchorPart +
msg = "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
"' is anchored, but the other parts of this regular expression are not"
)
}

View File

@@ -46,6 +46,9 @@ class RegExpTerm extends Locatable, @regexpterm {
/** Gets the number of child terms of this term. */
int getNumChild() { result = count(getAChild()) }
/** Gets the last child term of this term. */
RegExpTerm getLastChild() { result = getChild(getNumChild() - 1) }
/**
* Gets the parent term of this regular expression term, or the
* regular expression literal if this is the root term.
@@ -266,6 +269,20 @@ class RegExpSequence extends RegExpTerm, @regexp_seq {
}
}
/**
* A dollar `$` or caret assertion `^` matching the beginning or end of a line.
*
* Example:
*
* ```
* ^
* $
* ```
*/
class RegExpAnchor extends RegExpTerm, @regexp_anchor {
override predicate isNullable() { any() }
}
/**
* A caret assertion `^` matching the beginning of a line.
*
@@ -275,8 +292,7 @@ class RegExpSequence extends RegExpTerm, @regexp_seq {
* ^
* ```
*/
class RegExpCaret extends RegExpTerm, @regexp_caret {
override predicate isNullable() { any() }
class RegExpCaret extends RegExpAnchor, @regexp_caret {
}
/**
@@ -288,8 +304,7 @@ class RegExpCaret extends RegExpTerm, @regexp_caret {
* $
* ```
*/
class RegExpDollar extends RegExpTerm, @regexp_dollar {
override predicate isNullable() { any() }
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
}
/**
@@ -814,26 +829,26 @@ abstract class RegExpPatternSource extends DataFlow::Node {
* of this node.
*/
abstract DataFlow::SourceNode getARegExpObject();
abstract RegExpTerm getRegExpTerm();
}
/**
* A regular expression literal, viewed as the pattern source for itself.
*/
private class RegExpLiteralPatternSource extends RegExpPatternSource {
string pattern;
RegExpLiteralPatternSource() {
exists(string raw | raw = asExpr().(RegExpLiteral).getRoot().getRawValue() |
// hide the fact that `/` is escaped in the literal
pattern = raw.regexpReplaceAll("\\\\/", "/")
)
}
private class RegExpLiteralPatternSource extends RegExpPatternSource, DataFlow::ValueNode {
override RegExpLiteral astNode;
override DataFlow::Node getAParse() { result = this }
override string getPattern() { result = pattern }
override string getPattern() {
// hide the fact that `/` is escaped in the literal
result = astNode.getRoot().getRawValue().regexpReplaceAll("\\\\/", "/")
}
override DataFlow::SourceNode getARegExpObject() { result = this }
override RegExpTerm getRegExpTerm() { result = astNode.getRoot() }
}
/**
@@ -856,4 +871,6 @@ private class StringRegExpPatternSource extends RegExpPatternSource {
}
override string getPattern() { result = getStringValue() }
override RegExpTerm getRegExpTerm() { result = asExpr().(StringLiteral).asRegExp() }
}

View File

@@ -858,6 +858,7 @@ regexpParseErrors (unique int id: @regexp_parse_error,
@regexp_lookahead = @regexp_positive_lookahead | @regexp_negative_lookahead;
@regexp_lookbehind = @regexp_positive_lookbehind | @regexp_negative_lookbehind;
@regexp_subpattern = @regexp_lookahead | @regexp_lookbehind;
@regexp_anchor = @regexp_dollar | @regexp_caret;
isGreedy (int id: @regexp_quantifier ref);
rangeQuantifierLowerBound (unique int id: @regexp_range ref, int lo: int ref);