JS: Update semi-anchored regex query

This commit is contained in:
Asger F
2019-10-16 22:15:57 +01:00
parent c21d095d38
commit 8bc89ee254
5 changed files with 138 additions and 37 deletions

View File

@@ -12,6 +12,84 @@
import javascript
/** Holds if `term` is one of the transitive left children of a regexp. */
predicate isLeftArmTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpTerm parent |
term = parent.getChild(0) and
isLeftArmTerm(parent)
)
}
/** Holds if `term` is one of the transitive right children of a regexp. */
predicate isRightArmTerm(RegExpTerm term) {
term.isRootTerm()
or
exists(RegExpTerm parent |
term = parent.getLastChild() and
isRightArmTerm(parent)
)
}
/**
* Holds if `term` is an anchor that is not the first or last node
* in its tree.
*/
predicate isInteriorAnchor(RegExpAnchor term) {
not isLeftArmTerm(term) and
not isRightArmTerm(term)
}
/**
* Holds if `term` contains an anchor that is not the first or last node
* in its tree, such as `(foo|bar$|baz)`.
*/
predicate containsInteriorAnchor(RegExpTerm term) {
isInteriorAnchor(term.getAChild*())
}
/**
* Holds if `term` starts with a word boundary or lookbehind assertion,
* indicating that it's not intended to be anchored on that side.
*/
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
exists(RegExpTerm child | child = term.getChild(0) |
child instanceof RegExpWordBoundary or
child instanceof RegExpNonWordBoundary or
child instanceof RegExpLookbehind
)
}
/**
* Holds if `term` ends with a word boundary or lookahead assertion,
* indicating that it's not intended to be anchored on that side.
*/
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
exists(RegExpTerm child | child = term.getLastChild() |
child instanceof RegExpWordBoundary or
child instanceof RegExpNonWordBoundary or
child instanceof RegExpLookahead
)
}
/**
* Holds if `term` is an empty sequence, usually arising from
* literals with a trailing alternative such as `foo|`.
*/
predicate isEmpty(RegExpSequence term) {
term.getNumChild() = 0
}
/**
* Holds if `term` contains a letter constant.
*
* We use this as a heuristic to filter out uninteresting results.
*/
predicate containsLetters(RegExpTerm term) {
term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
}
/**
* Holds if `src` is a pattern for a collection of alternatives where
* only the first or last alternative is anchored, indicating a
@@ -21,23 +99,21 @@ import javascript
* parsed as `(^a)|(b)|(c)`.
*/
predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
exists(string str, string maybeGroupedStr, string regex, string anchorPart, string escapedDot |
// a dot that might be escaped in a regular expression, for example `/\./` or `new RegExp('\\.')`
escapedDot = "\\\\[.]" and
// a string that is mostly free from special reqular expression symbols
str = "(?:(?:" + escapedDot + ")|[a-z:/.?_,@0-9 -])+" and
// the string may be wrapped in parentheses
maybeGroupedStr = "(?:" + str + "|\\(" + str + "\\))" and
exists(RegExpAlt root, RegExpSequence anchoredTerm |
root = src.getRegExpTerm() and
not containsInteriorAnchor(root) and
not isEmpty(root.getAChild()) and
containsLetters(anchoredTerm) and
(
// a problematic pattern: `^a|b|...|x`
regex = "(?i)(\\^" + maybeGroupedStr + ")(?:\\|" + maybeGroupedStr + ")+"
anchoredTerm = root.getChild(0) and
anchoredTerm.getChild(0) instanceof RegExpCaret and
not containsLeadingPseudoAnchor(root.getChild([ 1 .. root.getNumChild() - 1 ]))
or
// a problematic pattern: `a|b|...|x$`
regex = "(?i)(?:" + maybeGroupedStr + "\\|)+(" + maybeGroupedStr + "\\$)"
anchoredTerm = root.getLastChild() and
anchoredTerm.getLastChild() instanceof RegExpDollar and
not containsTrailingPseudoAnchor(root.getChild([ 0 .. root.getNumChild() - 2 ]))
) and
anchorPart = src.getPattern().regexpCapture(regex, 1) and
anchorPart.regexpMatch("(?i).*[a-z].*") and
msg = "Misleading operator precedence. The subexpression '" + anchorPart +
msg = "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
"' is anchored, but the other parts of this regular expression are not"
)
}

View File

@@ -46,6 +46,9 @@ class RegExpTerm extends Locatable, @regexpterm {
/** Gets the number of child terms of this term. */
int getNumChild() { result = count(getAChild()) }
/** Gets the last child term of this term. */
RegExpTerm getLastChild() { result = getChild(getNumChild() - 1) }
/**
* Gets the parent term of this regular expression term, or the
* regular expression literal if this is the root term.
@@ -266,6 +269,20 @@ class RegExpSequence extends RegExpTerm, @regexp_seq {
}
}
/**
* A dollar `$` or caret assertion `^` matching the beginning or end of a line.
*
* Example:
*
* ```
* ^
* $
* ```
*/
class RegExpAnchor extends RegExpTerm, @regexp_anchor {
override predicate isNullable() { any() }
}
/**
* A caret assertion `^` matching the beginning of a line.
*
@@ -275,8 +292,7 @@ class RegExpSequence extends RegExpTerm, @regexp_seq {
* ^
* ```
*/
class RegExpCaret extends RegExpTerm, @regexp_caret {
override predicate isNullable() { any() }
class RegExpCaret extends RegExpAnchor, @regexp_caret {
}
/**
@@ -288,8 +304,7 @@ class RegExpCaret extends RegExpTerm, @regexp_caret {
* $
* ```
*/
class RegExpDollar extends RegExpTerm, @regexp_dollar {
override predicate isNullable() { any() }
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
}
/**
@@ -814,26 +829,26 @@ abstract class RegExpPatternSource extends DataFlow::Node {
* of this node.
*/
abstract DataFlow::SourceNode getARegExpObject();
abstract RegExpTerm getRegExpTerm();
}
/**
* A regular expression literal, viewed as the pattern source for itself.
*/
private class RegExpLiteralPatternSource extends RegExpPatternSource {
string pattern;
RegExpLiteralPatternSource() {
exists(string raw | raw = asExpr().(RegExpLiteral).getRoot().getRawValue() |
// hide the fact that `/` is escaped in the literal
pattern = raw.regexpReplaceAll("\\\\/", "/")
)
}
private class RegExpLiteralPatternSource extends RegExpPatternSource, DataFlow::ValueNode {
override RegExpLiteral astNode;
override DataFlow::Node getAParse() { result = this }
override string getPattern() { result = pattern }
override string getPattern() {
// hide the fact that `/` is escaped in the literal
result = astNode.getRoot().getRawValue().regexpReplaceAll("\\\\/", "/")
}
override DataFlow::SourceNode getARegExpObject() { result = this }
override RegExpTerm getRegExpTerm() { result = astNode.getRoot() }
}
/**
@@ -856,4 +871,6 @@ private class StringRegExpPatternSource extends RegExpPatternSource {
}
override string getPattern() { result = getStringValue() }
override RegExpTerm getRegExpTerm() { result = asExpr().(StringLiteral).asRegExp() }
}

View File

@@ -858,6 +858,7 @@ regexpParseErrors (unique int id: @regexp_parse_error,
@regexp_lookahead = @regexp_positive_lookahead | @regexp_negative_lookahead;
@regexp_lookbehind = @regexp_positive_lookbehind | @regexp_negative_lookbehind;
@regexp_subpattern = @regexp_lookahead | @regexp_lookbehind;
@regexp_anchor = @regexp_dollar | @regexp_caret;
isGreedy (int id: @regexp_quantifier ref);
rangeQuantifierLowerBound (unique int id: @regexp_range ref, int lo: int ref);

View File

@@ -8,6 +8,9 @@
| tst-SemiAnchoredRegExp.js:28:2:28:11 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:31:2:31:25 | /^good\\ ... r\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:32:2:32:27 | /^good\\ ... \\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:33:2:33:29 | /^good\\ ... \\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:34:2:34:31 | /^good\\ ... \\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:39:13:39:18 | "^a\|b" | Misleading operator precedence. The subexpression '^a' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:42:13:42:20 | "^a\|b\|c" | Misleading operator precedence. The subexpression '^a' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:48:13:48:20 | "^a\|(b)" | Misleading operator precedence. The subexpression '^a' is anchored, but the other parts of this regular expression are not |
@@ -20,16 +23,20 @@
| tst-SemiAnchoredRegExp.js:67:13:67:36 | '^good\\ ... r\\.com' | Misleading operator precedence. The subexpression '^good.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:68:13:68:38 | '^good\\ ... \\\\.com' | Misleading operator precedence. The subexpression '^good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:69:13:69:40 | '^good\\ ... \\\\.com' | Misleading operator precedence. The subexpression '^good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:70:13:70:42 | '^good\\ ... \\\\.com' | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:79:2:79:27 | /(\\.xxx ... .zzz)$/ | Misleading operator precedence. The subexpression '(\\.zzz)$' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:81:2:81:23 | /\\.xxx\| ... zzz$/ig | Misleading operator precedence. The subexpression '\\.zzz$' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:82:2:82:19 | /\\.xxx\|\\.yyy\|zzz$/ | Misleading operator precedence. The subexpression 'zzz$' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:83:2:83:31 | /^(?:mo ... \|click/ | Misleading operator precedence. The subexpression '^(?:mouse\|contextmenu)' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:85:2:85:28 | /^(xxx ... yyy)/i | Misleading operator precedence. The subexpression '^(xxx yyy zzz)' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:86:2:86:53 | /^(xxx ... x\|1st/i | Misleading operator precedence. The subexpression '^(xxx yyy zzz)' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:87:2:87:24 | /^(xxx: ... (zzz:)/ | Misleading operator precedence. The subexpression '^(xxx:)' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:88:2:88:23 | /^(xxx? ... zzz\\/)/ | Misleading operator precedence. The subexpression '^(xxx?:)' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:89:2:89:16 | /^@media\|@page/ | Misleading operator precedence. The subexpression '^@media' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:90:2:90:32 | /^\\s*(x ... :yyy\\// | Misleading operator precedence. The subexpression '^\\s*(xxx?\|yyy\|zzz):' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:91:2:91:21 | /^click\|mouse\|touch/ | Misleading operator precedence. The subexpression '^click' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:92:2:92:43 | /^http: ... r\\.com/ | Misleading operator precedence. The subexpression '^http://good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:93:2:93:47 | /^https ... r\\.com/ | Misleading operator precedence. The subexpression '^https?://good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:92:2:92:43 | /^http: ... r\\.com/ | Misleading operator precedence. The subexpression '^http:\\/\\/good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:93:2:93:47 | /^https ... r\\.com/ | Misleading operator precedence. The subexpression '^https?:\\/\\/good\\.com' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:94:2:94:55 | /^mouse ... ragend/ | Misleading operator precedence. The subexpression '^mouse' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:95:2:95:14 | /^xxx:\|yyy:/i | Misleading operator precedence. The subexpression '^xxx:' is anchored, but the other parts of this regular expression are not |
| tst-SemiAnchoredRegExp.js:96:2:96:18 | /_xxx\|_yyy\|_zzz$/ | Misleading operator precedence. The subexpression '_zzz$' is anchored, but the other parts of this regular expression are not |

View File

@@ -29,9 +29,9 @@
/^good.com|better.com/; // NOT OK
/^good\.com|better\.com/; // NOT OK
/^good\\.com|better\\.com/;
/^good\\\.com|better\\\.com/;
/^good\\\\.com|better\\\\.com/;
/^good\\.com|better\\.com/; // NOT OK
/^good\\\.com|better\\\.com/; // NOT OK
/^good\\\\.com|better\\\\.com/; // NOT OK
});
(function coreString() {
@@ -67,7 +67,7 @@
new RegExp('^good\.com|better\.com'); // NOT OK
new RegExp('^good\\.com|better\\.com'); // NOT OK
new RegExp('^good\\\.com|better\\\.com'); // NOT OK
new RegExp('^good\\\\.com|better\\\\.com');
new RegExp('^good\\\\.com|better\\\\.com'); // NOT OK
});
(function realWorld() {
@@ -80,14 +80,14 @@
/(^left|right|center)\sbottom$/; // not flagged at the moment due to multiple anchors
/\.xxx|\.yyy|\.zzz$/ig;
/\.xxx|\.yyy|zzz$/;
/^(?:mouse|contextmenu)|click/; // not flagged at the moment due to nested alternatives
/^(?:mouse|contextmenu)|click/;
/^([A-Z]|xxx[XY]$)/; // not flagged at the moment due to multiple anchors
/^(xxx yyy zzz)|(xxx yyy)/i;
/^(xxx yyy zzz)|(xxx yyy)|(1st( xxx)? yyy)|xxx|1st/i; // not flagged at the moment due to nested parens
/^(xxx yyy zzz)|(xxx yyy)|(1st( xxx)? yyy)|xxx|1st/i;
/^(xxx:)|(yyy:)|(zzz:)/;
/^(xxx?:)|(yyy:zzz\/)/;
/^@media|@page/;
/^\s*(xxx?|yyy|zzz):|xxx:yyy\//; // not flagged at the moment due to quantifiers
/^\s*(xxx?|yyy|zzz):|xxx:yyy\//;
/^click|mouse|touch/;
/^http:\/\/good\.com|http:\/\/better\.com/;
/^https?:\/\/good\.com|https?:\/\/better\.com/;