mirror of
https://github.com/github/codeql.git
synced 2026-05-01 03:35:13 +02:00
JS: Update semi-anchored regex query
This commit is contained in:
@@ -12,6 +12,84 @@
|
||||
|
||||
import javascript
|
||||
|
||||
/** Holds if `term` is one of the transitive left children of a regexp. */
|
||||
predicate isLeftArmTerm(RegExpTerm term) {
|
||||
term.isRootTerm()
|
||||
or
|
||||
exists(RegExpTerm parent |
|
||||
term = parent.getChild(0) and
|
||||
isLeftArmTerm(parent)
|
||||
)
|
||||
}
|
||||
|
||||
/** Holds if `term` is one of the transitive right children of a regexp. */
|
||||
predicate isRightArmTerm(RegExpTerm term) {
|
||||
term.isRootTerm()
|
||||
or
|
||||
exists(RegExpTerm parent |
|
||||
term = parent.getLastChild() and
|
||||
isRightArmTerm(parent)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `term` is an anchor that is not the first or last node
|
||||
* in its tree.
|
||||
*/
|
||||
predicate isInteriorAnchor(RegExpAnchor term) {
|
||||
not isLeftArmTerm(term) and
|
||||
not isRightArmTerm(term)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `term` contains an anchor that is not the first or last node
|
||||
* in its tree, such as `(foo|bar$|baz)`.
|
||||
*/
|
||||
predicate containsInteriorAnchor(RegExpTerm term) {
|
||||
isInteriorAnchor(term.getAChild*())
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `term` starts with a word boundary or lookbehind assertion,
|
||||
* indicating that it's not intended to be anchored on that side.
|
||||
*/
|
||||
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
|
||||
exists(RegExpTerm child | child = term.getChild(0) |
|
||||
child instanceof RegExpWordBoundary or
|
||||
child instanceof RegExpNonWordBoundary or
|
||||
child instanceof RegExpLookbehind
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `term` ends with a word boundary or lookahead assertion,
|
||||
* indicating that it's not intended to be anchored on that side.
|
||||
*/
|
||||
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
|
||||
exists(RegExpTerm child | child = term.getLastChild() |
|
||||
child instanceof RegExpWordBoundary or
|
||||
child instanceof RegExpNonWordBoundary or
|
||||
child instanceof RegExpLookahead
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `term` is an empty sequence, usually arising from
|
||||
* literals with a trailing alternative such as `foo|`.
|
||||
*/
|
||||
predicate isEmpty(RegExpSequence term) {
|
||||
term.getNumChild() = 0
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `term` contains a letter constant.
|
||||
*
|
||||
* We use this as a heuristic to filter out uninteresting results.
|
||||
*/
|
||||
predicate containsLetters(RegExpTerm term) {
|
||||
term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds if `src` is a pattern for a collection of alternatives where
|
||||
* only the first or last alternative is anchored, indicating a
|
||||
@@ -21,23 +99,21 @@ import javascript
|
||||
* parsed as `(^a)|(b)|(c)`.
|
||||
*/
|
||||
predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
|
||||
exists(string str, string maybeGroupedStr, string regex, string anchorPart, string escapedDot |
|
||||
// a dot that might be escaped in a regular expression, for example `/\./` or `new RegExp('\\.')`
|
||||
escapedDot = "\\\\[.]" and
|
||||
// a string that is mostly free from special reqular expression symbols
|
||||
str = "(?:(?:" + escapedDot + ")|[a-z:/.?_,@0-9 -])+" and
|
||||
// the string may be wrapped in parentheses
|
||||
maybeGroupedStr = "(?:" + str + "|\\(" + str + "\\))" and
|
||||
exists(RegExpAlt root, RegExpSequence anchoredTerm |
|
||||
root = src.getRegExpTerm() and
|
||||
not containsInteriorAnchor(root) and
|
||||
not isEmpty(root.getAChild()) and
|
||||
containsLetters(anchoredTerm) and
|
||||
(
|
||||
// a problematic pattern: `^a|b|...|x`
|
||||
regex = "(?i)(\\^" + maybeGroupedStr + ")(?:\\|" + maybeGroupedStr + ")+"
|
||||
anchoredTerm = root.getChild(0) and
|
||||
anchoredTerm.getChild(0) instanceof RegExpCaret and
|
||||
not containsLeadingPseudoAnchor(root.getChild([ 1 .. root.getNumChild() - 1 ]))
|
||||
or
|
||||
// a problematic pattern: `a|b|...|x$`
|
||||
regex = "(?i)(?:" + maybeGroupedStr + "\\|)+(" + maybeGroupedStr + "\\$)"
|
||||
anchoredTerm = root.getLastChild() and
|
||||
anchoredTerm.getLastChild() instanceof RegExpDollar and
|
||||
not containsTrailingPseudoAnchor(root.getChild([ 0 .. root.getNumChild() - 2 ]))
|
||||
) and
|
||||
anchorPart = src.getPattern().regexpCapture(regex, 1) and
|
||||
anchorPart.regexpMatch("(?i).*[a-z].*") and
|
||||
msg = "Misleading operator precedence. The subexpression '" + anchorPart +
|
||||
msg = "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
|
||||
"' is anchored, but the other parts of this regular expression are not"
|
||||
)
|
||||
}
|
||||
|
||||
@@ -46,6 +46,9 @@ class RegExpTerm extends Locatable, @regexpterm {
|
||||
/** Gets the number of child terms of this term. */
|
||||
int getNumChild() { result = count(getAChild()) }
|
||||
|
||||
/** Gets the last child term of this term. */
|
||||
RegExpTerm getLastChild() { result = getChild(getNumChild() - 1) }
|
||||
|
||||
/**
|
||||
* Gets the parent term of this regular expression term, or the
|
||||
* regular expression literal if this is the root term.
|
||||
@@ -266,6 +269,20 @@ class RegExpSequence extends RegExpTerm, @regexp_seq {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* A dollar `$` or caret assertion `^` matching the beginning or end of a line.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* ```
|
||||
* ^
|
||||
* $
|
||||
* ```
|
||||
*/
|
||||
class RegExpAnchor extends RegExpTerm, @regexp_anchor {
|
||||
override predicate isNullable() { any() }
|
||||
}
|
||||
|
||||
/**
|
||||
* A caret assertion `^` matching the beginning of a line.
|
||||
*
|
||||
@@ -275,8 +292,7 @@ class RegExpSequence extends RegExpTerm, @regexp_seq {
|
||||
* ^
|
||||
* ```
|
||||
*/
|
||||
class RegExpCaret extends RegExpTerm, @regexp_caret {
|
||||
override predicate isNullable() { any() }
|
||||
class RegExpCaret extends RegExpAnchor, @regexp_caret {
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -288,8 +304,7 @@ class RegExpCaret extends RegExpTerm, @regexp_caret {
|
||||
* $
|
||||
* ```
|
||||
*/
|
||||
class RegExpDollar extends RegExpTerm, @regexp_dollar {
|
||||
override predicate isNullable() { any() }
|
||||
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -814,26 +829,26 @@ abstract class RegExpPatternSource extends DataFlow::Node {
|
||||
* of this node.
|
||||
*/
|
||||
abstract DataFlow::SourceNode getARegExpObject();
|
||||
|
||||
abstract RegExpTerm getRegExpTerm();
|
||||
}
|
||||
|
||||
/**
|
||||
* A regular expression literal, viewed as the pattern source for itself.
|
||||
*/
|
||||
private class RegExpLiteralPatternSource extends RegExpPatternSource {
|
||||
string pattern;
|
||||
|
||||
RegExpLiteralPatternSource() {
|
||||
exists(string raw | raw = asExpr().(RegExpLiteral).getRoot().getRawValue() |
|
||||
// hide the fact that `/` is escaped in the literal
|
||||
pattern = raw.regexpReplaceAll("\\\\/", "/")
|
||||
)
|
||||
}
|
||||
private class RegExpLiteralPatternSource extends RegExpPatternSource, DataFlow::ValueNode {
|
||||
override RegExpLiteral astNode;
|
||||
|
||||
override DataFlow::Node getAParse() { result = this }
|
||||
|
||||
override string getPattern() { result = pattern }
|
||||
override string getPattern() {
|
||||
// hide the fact that `/` is escaped in the literal
|
||||
result = astNode.getRoot().getRawValue().regexpReplaceAll("\\\\/", "/")
|
||||
}
|
||||
|
||||
override DataFlow::SourceNode getARegExpObject() { result = this }
|
||||
|
||||
override RegExpTerm getRegExpTerm() { result = astNode.getRoot() }
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -856,4 +871,6 @@ private class StringRegExpPatternSource extends RegExpPatternSource {
|
||||
}
|
||||
|
||||
override string getPattern() { result = getStringValue() }
|
||||
|
||||
override RegExpTerm getRegExpTerm() { result = asExpr().(StringLiteral).asRegExp() }
|
||||
}
|
||||
|
||||
@@ -858,6 +858,7 @@ regexpParseErrors (unique int id: @regexp_parse_error,
|
||||
@regexp_lookahead = @regexp_positive_lookahead | @regexp_negative_lookahead;
|
||||
@regexp_lookbehind = @regexp_positive_lookbehind | @regexp_negative_lookbehind;
|
||||
@regexp_subpattern = @regexp_lookahead | @regexp_lookbehind;
|
||||
@regexp_anchor = @regexp_dollar | @regexp_caret;
|
||||
|
||||
isGreedy (int id: @regexp_quantifier ref);
|
||||
rangeQuantifierLowerBound (unique int id: @regexp_range ref, int lo: int ref);
|
||||
|
||||
@@ -8,6 +8,9 @@
|
||||
| tst-SemiAnchoredRegExp.js:28:2:28:11 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:30:2:30:23 | /^good. ... er.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:31:2:31:25 | /^good\\ ... r\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:32:2:32:27 | /^good\\ ... \\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:33:2:33:29 | /^good\\ ... \\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:34:2:34:31 | /^good\\ ... \\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:39:13:39:18 | "^a\|b" | Misleading operator precedence. The subexpression '^a' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:42:13:42:20 | "^a\|b\|c" | Misleading operator precedence. The subexpression '^a' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:48:13:48:20 | "^a\|(b)" | Misleading operator precedence. The subexpression '^a' is anchored, but the other parts of this regular expression are not |
|
||||
@@ -20,16 +23,20 @@
|
||||
| tst-SemiAnchoredRegExp.js:67:13:67:36 | '^good\\ ... r\\.com' | Misleading operator precedence. The subexpression '^good.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:68:13:68:38 | '^good\\ ... \\\\.com' | Misleading operator precedence. The subexpression '^good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:69:13:69:40 | '^good\\ ... \\\\.com' | Misleading operator precedence. The subexpression '^good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:70:13:70:42 | '^good\\ ... \\\\.com' | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:79:2:79:27 | /(\\.xxx ... .zzz)$/ | Misleading operator precedence. The subexpression '(\\.zzz)$' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:81:2:81:23 | /\\.xxx\| ... zzz$/ig | Misleading operator precedence. The subexpression '\\.zzz$' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:82:2:82:19 | /\\.xxx\|\\.yyy\|zzz$/ | Misleading operator precedence. The subexpression 'zzz$' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:83:2:83:31 | /^(?:mo ... \|click/ | Misleading operator precedence. The subexpression '^(?:mouse\|contextmenu)' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:85:2:85:28 | /^(xxx ... yyy)/i | Misleading operator precedence. The subexpression '^(xxx yyy zzz)' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:86:2:86:53 | /^(xxx ... x\|1st/i | Misleading operator precedence. The subexpression '^(xxx yyy zzz)' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:87:2:87:24 | /^(xxx: ... (zzz:)/ | Misleading operator precedence. The subexpression '^(xxx:)' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:88:2:88:23 | /^(xxx? ... zzz\\/)/ | Misleading operator precedence. The subexpression '^(xxx?:)' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:89:2:89:16 | /^@media\|@page/ | Misleading operator precedence. The subexpression '^@media' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:90:2:90:32 | /^\\s*(x ... :yyy\\// | Misleading operator precedence. The subexpression '^\\s*(xxx?\|yyy\|zzz):' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:91:2:91:21 | /^click\|mouse\|touch/ | Misleading operator precedence. The subexpression '^click' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:92:2:92:43 | /^http: ... r\\.com/ | Misleading operator precedence. The subexpression '^http://good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:93:2:93:47 | /^https ... r\\.com/ | Misleading operator precedence. The subexpression '^https?://good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:92:2:92:43 | /^http: ... r\\.com/ | Misleading operator precedence. The subexpression '^http:\\/\\/good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:93:2:93:47 | /^https ... r\\.com/ | Misleading operator precedence. The subexpression '^https?:\\/\\/good\\.com' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:94:2:94:55 | /^mouse ... ragend/ | Misleading operator precedence. The subexpression '^mouse' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:95:2:95:14 | /^xxx:\|yyy:/i | Misleading operator precedence. The subexpression '^xxx:' is anchored, but the other parts of this regular expression are not |
|
||||
| tst-SemiAnchoredRegExp.js:96:2:96:18 | /_xxx\|_yyy\|_zzz$/ | Misleading operator precedence. The subexpression '_zzz$' is anchored, but the other parts of this regular expression are not |
|
||||
|
||||
@@ -29,9 +29,9 @@
|
||||
|
||||
/^good.com|better.com/; // NOT OK
|
||||
/^good\.com|better\.com/; // NOT OK
|
||||
/^good\\.com|better\\.com/;
|
||||
/^good\\\.com|better\\\.com/;
|
||||
/^good\\\\.com|better\\\\.com/;
|
||||
/^good\\.com|better\\.com/; // NOT OK
|
||||
/^good\\\.com|better\\\.com/; // NOT OK
|
||||
/^good\\\\.com|better\\\\.com/; // NOT OK
|
||||
});
|
||||
|
||||
(function coreString() {
|
||||
@@ -67,7 +67,7 @@
|
||||
new RegExp('^good\.com|better\.com'); // NOT OK
|
||||
new RegExp('^good\\.com|better\\.com'); // NOT OK
|
||||
new RegExp('^good\\\.com|better\\\.com'); // NOT OK
|
||||
new RegExp('^good\\\\.com|better\\\\.com');
|
||||
new RegExp('^good\\\\.com|better\\\\.com'); // NOT OK
|
||||
});
|
||||
|
||||
(function realWorld() {
|
||||
@@ -80,14 +80,14 @@
|
||||
/(^left|right|center)\sbottom$/; // not flagged at the moment due to multiple anchors
|
||||
/\.xxx|\.yyy|\.zzz$/ig;
|
||||
/\.xxx|\.yyy|zzz$/;
|
||||
/^(?:mouse|contextmenu)|click/; // not flagged at the moment due to nested alternatives
|
||||
/^(?:mouse|contextmenu)|click/;
|
||||
/^([A-Z]|xxx[XY]$)/; // not flagged at the moment due to multiple anchors
|
||||
/^(xxx yyy zzz)|(xxx yyy)/i;
|
||||
/^(xxx yyy zzz)|(xxx yyy)|(1st( xxx)? yyy)|xxx|1st/i; // not flagged at the moment due to nested parens
|
||||
/^(xxx yyy zzz)|(xxx yyy)|(1st( xxx)? yyy)|xxx|1st/i;
|
||||
/^(xxx:)|(yyy:)|(zzz:)/;
|
||||
/^(xxx?:)|(yyy:zzz\/)/;
|
||||
/^@media|@page/;
|
||||
/^\s*(xxx?|yyy|zzz):|xxx:yyy\//; // not flagged at the moment due to quantifiers
|
||||
/^\s*(xxx?|yyy|zzz):|xxx:yyy\//;
|
||||
/^click|mouse|touch/;
|
||||
/^http:\/\/good\.com|http:\/\/better\.com/;
|
||||
/^https?:\/\/good\.com|https?:\/\/better\.com/;
|
||||
|
||||
Reference in New Issue
Block a user