From 992876276950e88af6c1769f0114b7226e2db649 Mon Sep 17 00:00:00 2001 From: Asger F Date: Fri, 3 Jan 2020 16:31:03 +0000 Subject: [PATCH] JS: Add RegExpAlwaysMatches query --- change-notes/1.24/analysis-javascript.md | 1 + .../config/suites/javascript/correctness-core | 1 + javascript/ql/src/Performance/ReDoS.ql | 21 +--- .../ql/src/RegExp/RegExpAlwaysMatches.qhelp | 54 ++++++++ .../ql/src/RegExp/RegExpAlwaysMatches.ql | 117 ++++++++++++++++++ .../RegExp/examples/RegExpAlwaysMatches.js | 3 + .../examples/RegExpAlwaysMatchesGood.js | 3 + .../ql/src/semmle/javascript/Regexp.qll | 17 +++ .../RegExpAlwaysMatches.expected | 5 + .../RegExpAlwaysMatches.qlref | 1 + .../RegExp/RegExpAlwaysMatches/tst.js | 67 ++++++++++ 11 files changed, 271 insertions(+), 19 deletions(-) create mode 100644 javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp create mode 100644 javascript/ql/src/RegExp/RegExpAlwaysMatches.ql create mode 100644 javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js create mode 100644 javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js create mode 100644 javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected create mode 100644 javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref create mode 100644 javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js diff --git a/change-notes/1.24/analysis-javascript.md b/change-notes/1.24/analysis-javascript.md index 20dd26299e1..65076c235ff 100644 --- a/change-notes/1.24/analysis-javascript.md +++ b/change-notes/1.24/analysis-javascript.md @@ -17,6 +17,7 @@ | **Query** | **Tags** | **Purpose** | |---------------------------------------------------------------------------------|-------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Cross-site scripting through exception (`js/xss-through-exception`) | security, external/cwe/cwe-079, external/cwe/cwe-116 | Highlights potential XSS vulnerabilities where an exception is written to the DOM. Results are not shown on LGTM by default. | +| Regular expression always matches (`js/regex/always-matches`) | correctness, regular-expressions | Highlights regular expression checks that trivially succeed by matching an empty substring. | ## Changes to existing queries diff --git a/javascript/config/suites/javascript/correctness-core b/javascript/config/suites/javascript/correctness-core index d6b91e83a07..94ca3d8e054 100644 --- a/javascript/config/suites/javascript/correctness-core +++ b/javascript/config/suites/javascript/correctness-core @@ -34,6 +34,7 @@ + semmlecode-javascript-queries/RegExp/DuplicateCharacterInCharacterClass.ql: /Correctness/Regular Expressions + semmlecode-javascript-queries/RegExp/EmptyCharacterClass.ql: /Correctness/Regular Expressions + semmlecode-javascript-queries/RegExp/IdentityReplacement.ql: /Correctness/Regular Expressions ++ semmlecode-javascript-queries/RegExp/RegExpAlwaysMatches.ql: /Correctness/Regular Expressions + semmlecode-javascript-queries/RegExp/UnboundBackref.ql: /Correctness/Regular Expressions + semmlecode-javascript-queries/RegExp/UnmatchableCaret.ql: /Correctness/Regular Expressions + semmlecode-javascript-queries/RegExp/UnmatchableDollar.ql: /Correctness/Regular Expressions diff --git a/javascript/ql/src/Performance/ReDoS.ql b/javascript/ql/src/Performance/ReDoS.ql index 2826306e5d2..7f74dcb47d5 100644 --- a/javascript/ql/src/Performance/ReDoS.ql +++ b/javascript/ql/src/Performance/ReDoS.ql @@ -144,7 +144,7 @@ newtype TInputSymbol = CharClass(RegExpCharacterClass recc) { getRoot(recc).isRelevant() and not recc.isInverted() and - not isUniversalClass(recc) + not recc.isUniversalClass() } or /** An input symbol representing all characters matched by `.`. */ Dot() or @@ -153,23 +153,6 @@ newtype TInputSymbol = /** An epsilon transition in the automaton. */ Epsilon() -/** - * Holds if character class `cc` matches all characters. - */ -predicate isUniversalClass(RegExpCharacterClass cc) { - // [^] - cc.isInverted() and not exists(cc.getAChild()) - or - // [\w\W] and similar - not cc.isInverted() and - exists(string cce1, string cce2 | - cce1 = cc.getAChild().(RegExpCharacterClassEscape).getValue() and - cce2 = cc.getAChild().(RegExpCharacterClassEscape).getValue() - | - cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() - ) -} - /** * An abstract input symbol, representing a set of concrete characters. */ @@ -361,7 +344,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) { ) or exists(RegExpCharacterClass cc | - isUniversalClass(cc) and q1 = before(cc) and lbl = Any() and q2 = after(cc) + cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) or q1 = before(cc) and lbl = CharClass(cc) and q2 = after(cc) ) diff --git a/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp b/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp new file mode 100644 index 00000000000..cf31c172862 --- /dev/null +++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp @@ -0,0 +1,54 @@ + + + + +

+There are several built-in JavaScript functions that search for a regular expression match within a string, +such as RegExp.prototype.test and String.prototype.search. +If the regular expression is not anchored, the regular expression does not need to match the whole string; +it only needs to match a substring. +

+ +

+If the regular expression being searched for accepts the empty string, this means it can match an empty +substring anywhere in the input string, and will thus always find a match. +In this case, testing if a match exists is redundant and indicates dead code. +

+ +
+ + +

+Examine the regular expression and determine how it was intended to match: +

    +
  • To match the whole input string, add anchors at the beginning and end of the regular expression.
  • +
  • To search for an occurrence within the input string, consider what the shortest meaningful match is and restrict the +regular expression accordingly, such as by changing a * to a +.
  • +
+

+ +
+ +

+In the following example, a regular expression is used to check the format of an string id. +However, the check always passes because the regular expression can match the empty substring. +For example, it will allow the ID string "%%" by matching an empty string at index 0. +

+ + + +

+To ensure the regular expression matches the whole string, add anchors at the beginning and end: +

+ + + +
+ + +
  • Mozilla Developer Network: JavaScript Regular Expressions.
  • + +
    +
    diff --git a/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql b/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql new file mode 100644 index 00000000000..b3eee6c6776 --- /dev/null +++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql @@ -0,0 +1,117 @@ +/** + * @name Regular expression always matches + * @description Regular expression tests checks always find a match indicate dead code or a logic error + * @kind problem + * @problem.severity warning + * @id js/regex/always-matches + * @tags correctness + * regular-expressions + * @precision high + */ + +import javascript + +/** + * Gets a node reachable from the given root term through alts and groups only. + * + * For example, for `/(foo|bar)/` this gets `(foo|bar)`, `foo|bar`, `foo` and `bar`. + */ +RegExpTerm getEffectiveRootAux(RegExpTerm actualRoot) { + actualRoot.isRootTerm() and + result = actualRoot + or + result = getEffectiveRootAux(actualRoot).(RegExpAlt).getAChild() + or + result = getEffectiveRootAux(actualRoot).(RegExpGroup).getAChild() +} + +/** + * Gets the effective root of the given term. + * + * For example, for `/(foo|bar)/` this gets `foo` and `bar`. + */ +RegExpTerm getEffectiveRoot(RegExpTerm actualRoot) { + result = getEffectiveRootAux(actualRoot) and + not result instanceof RegExpAlt and + not result instanceof RegExpGroup +} + +/** + * Holds if `term` contains an anchor on both ends. + */ +predicate isPossiblyAnchoredOnBothEnds(RegExpSequence node) { + node.getAChild*() instanceof RegExpCaret and + node.getAChild*() instanceof RegExpDollar and + node.getNumChild() >= 2 +} + +/** + * Holds if `term` is obviously intended to match any string. + */ +predicate isUniversalRegExp(RegExpTerm term) { + exists(RegExpTerm child | child = term.(RegExpStar).getAChild() | + child instanceof RegExpDot + or + child.(RegExpCharacterClass).isUniversalClass() + ) +} + +/** + * A call that searches for a regexp match within a string, but does not + * extract the capture groups or the matched string itself. + * + * Because of the longest-match rule, queries that are more than pure tests + * aren't necessarily broken just because the regexp can accept the empty string. + */ +abstract class RegExpQuery extends DataFlow::CallNode { + abstract RegExpTerm getRegExp(); +} + +/** + * A call to `RegExp.prototype.test`. + */ +class RegExpTestCall extends DataFlow::MethodCallNode, RegExpQuery { + DataFlow::RegExpCreationNode regexp; + + RegExpTestCall() { + this = regexp.getAReference().getAMethodCall("test") + } + + override RegExpTerm getRegExp() { + result = regexp.getRoot() + } +} + +/** + * A call to `String.prototype.search`. + */ +class RegExpSearchCall extends DataFlow::MethodCallNode, RegExpQuery { + DataFlow::RegExpCreationNode regexp; + + RegExpSearchCall() { + getMethodName() = "search" and + regexp.getAReference().flowsTo(getArgument(0)) + } + + override RegExpTerm getRegExp() { + result = regexp.getRoot() + } +} + +from RegExpTerm term, RegExpQuery call, string message +where + term.isNullable() and + not term.getAChild() instanceof RegExpSubPattern and + not isUniversalRegExp(term) and + term = getEffectiveRoot(call.getRegExp()) and + ( + call instanceof RegExpTestCall and + not isPossiblyAnchoredOnBothEnds(term) and + message = "This regular expression always matches when used in a test $@, as it can match an empty substring." + or + call instanceof RegExpSearchCall and + not term.getAChild*() instanceof RegExpDollar and + not term.getAChild*() instanceof RegExpSubPattern and + message = "This regular expression always the matches at index 0 when used $@, as it matches the empty substring." + ) +select term, message, call, "here" diff --git a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js new file mode 100644 index 00000000000..29e779dab79 --- /dev/null +++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js @@ -0,0 +1,3 @@ +if (!/[a-z0-9]*/.test(id)) { + throw new Error("Invalid id: " + id); +} diff --git a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js new file mode 100644 index 00000000000..878f8082f78 --- /dev/null +++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js @@ -0,0 +1,3 @@ +if (!/^[a-z0-9]*$/.test(id)) { + throw new Error("Invalid id: " + id); +} diff --git a/javascript/ql/src/semmle/javascript/Regexp.qll b/javascript/ql/src/semmle/javascript/Regexp.qll index ae6c541b21c..29e5a1bae3e 100644 --- a/javascript/ql/src/semmle/javascript/Regexp.qll +++ b/javascript/ql/src/semmle/javascript/Regexp.qll @@ -764,6 +764,23 @@ class RegExpCharacterClass extends RegExpTerm, @regexp_char_class { override string getAMatchedString() { not isInverted() and result = getAChild().getAMatchedString() } + + /** + * Holds if this character class matches any character. + */ + predicate isUniversalClass() { + // [^] + isInverted() and not exists(getAChild()) + or + // [\w\W] and similar + not isInverted() and + exists(string cce1, string cce2 | + cce1 = getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } } /** diff --git a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected new file mode 100644 index 00000000000..1a15b8702d3 --- /dev/null +++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected @@ -0,0 +1,5 @@ +| tst.js:2:11:2:20 | ^(https:)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:2:10:2:29 | /^(https:)?/.test(x) | here | +| tst.js:14:11:14:19 | (\\.com)?$ | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:14:10:14:28 | /(\\.com)?$/.test(x) | here | +| tst.js:22:11:22:34 | ^(?:https?:\|ftp:\|file:)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:22:10:22:43 | /^(?:ht ... test(x) | here | +| tst.js:30:11:30:20 | (foo\|bar)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:30:10:30:29 | /(foo\|bar)?/.test(x) | here | +| tst.js:34:21:34:26 | (baz)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:34:10:34:35 | /^foo\|b ... test(x) | here | diff --git a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref new file mode 100644 index 00000000000..acdde814bbc --- /dev/null +++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref @@ -0,0 +1 @@ +RegExp/RegExpAlwaysMatches.ql \ No newline at end of file diff --git a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js new file mode 100644 index 00000000000..4f03b5ff70f --- /dev/null +++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js @@ -0,0 +1,67 @@ +function optionalPrefix(x) { + return /^(https:)?/.test(x); // NOT OK +} + +function mandatoryPrefix(x) { + return /^https:/.test(x); // NOT OK +} + +function httpOrHttps(x) { + return /^https?:/.test(x); // OK +} + +function optionalSuffix(x) { + return /(\.com)?$/.test(x); // NOT OK +} + +function mandatorySuffix(x) { + return /\.com$/.test(x); // OK +} + +function protocol(x) { + return /^(?:https?:|ftp:|file:)?/.test(x); // NOT OK +} + +function doubleAnchored(x) { + return /^(foo|bar)?$/.test(x); // OK +} + +function noAnchor(x) { + return /(foo|bar)?/.test(x); // NOT OK +} + +function altAnchor(x) { + return /^foo|bar$|(baz)?/.test(x); // NOT OK +} + +function wildcard(x) { + return /.*/.test(x); // OK - obviously intended to match anything +} + +function wildcard2(x) { + return /[\d\D]*/.test(x); // OK - obviously intended to match anything +} + +function emptyAlt(x) { + return /^$|foo|bar/.test(x); // OK +} + +function emptyAlt2(x) { + return /(^$|foo|bar)/.test(x); // OK +} + +function emptyAlt3(x) { + return /((^$|foo|bar))/.test(x); // OK +} + +function search(x) { + return /[a-z]*/.search(x); // NOT OK +} + +function search2(x) { + return /[a-z]/.search(x); // OK +} + +function lookahead(x) { + return /(?!x)/.search(x); // OK +}