JS: Add RegExpAlwaysMatches query

2025-12-16 16:53:25 +01:00 · 2020-01-03 16:31:03 +00:00
parent 48deb30756
commit 9928762769
11 changed files with 271 additions and 19 deletions
--- a/change-notes/1.24/analysis-javascript.md
+++ b/change-notes/1.24/analysis-javascript.md
@@ -17,6 +17,7 @@
 | **Query**                                                                       | **Tags**                                                          | **Purpose**                                                                                                                                                                            |
 |---------------------------------------------------------------------------------|-------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | Cross-site scripting through exception (`js/xss-through-exception`) | security, external/cwe/cwe-079, external/cwe/cwe-116              | Highlights potential XSS vulnerabilities where an exception is written to the DOM. Results are not shown on LGTM by default. |
 | Regular expression always matches (`js/regex/always-matches`) | correctness, regular-expressions | Highlights regular expression checks that trivially succeed by matching an empty substring. |
 ## Changes to existing queries
--- a/javascript/config/suites/javascript/correctness-core
+++ b/javascript/config/suites/javascript/correctness-core
@@ -34,6 +34,7 @@
 + semmlecode-javascript-queries/RegExp/DuplicateCharacterInCharacterClass.ql: /Correctness/Regular Expressions
 + semmlecode-javascript-queries/RegExp/EmptyCharacterClass.ql: /Correctness/Regular Expressions
 + semmlecode-javascript-queries/RegExp/IdentityReplacement.ql: /Correctness/Regular Expressions
 + semmlecode-javascript-queries/RegExp/RegExpAlwaysMatches.ql: /Correctness/Regular Expressions
 + semmlecode-javascript-queries/RegExp/UnboundBackref.ql: /Correctness/Regular Expressions
 + semmlecode-javascript-queries/RegExp/UnmatchableCaret.ql: /Correctness/Regular Expressions
 + semmlecode-javascript-queries/RegExp/UnmatchableDollar.ql: /Correctness/Regular Expressions
--- a/javascript/ql/src/Performance/ReDoS.ql
+++ b/javascript/ql/src/Performance/ReDoS.ql
@@ -144,7 +144,7 @@ newtype TInputSymbol =
  CharClass(RegExpCharacterClass recc) {
    getRoot(recc).isRelevant() and
    not recc.isInverted() and
-    not isUniversalClass(recc)
+    not recc.isUniversalClass()
  } or
  /** An input symbol representing all characters matched by `.`. */
  Dot() or
@@ -153,23 +153,6 @@ newtype TInputSymbol =
  /** An epsilon transition in the automaton. */
  Epsilon()
 /**
 * Holds if character class `cc` matches all characters.
 */
 predicate isUniversalClass(RegExpCharacterClass cc) {
  // [^]
  cc.isInverted() and not exists(cc.getAChild())
  or
  // [\w\W] and similar
  not cc.isInverted() and
  exists(string cce1, string cce2 |
    cce1 = cc.getAChild().(RegExpCharacterClassEscape).getValue() and
    cce2 = cc.getAChild().(RegExpCharacterClassEscape).getValue()
  |
    cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
  )
 }
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
@@ -361,7 +344,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
  )
  or
  exists(RegExpCharacterClass cc |
-    isUniversalClass(cc) and q1 = before(cc) and lbl = Any() and q2 = after(cc)
+    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and lbl = CharClass(cc) and q2 = after(cc)
  )
--- a/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp
+++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp
@@ -0,0 +1,54 @@
 <!DOCTYPE qhelp PUBLIC
  "-//Semmle//qhelp//EN"
  "qhelp.dtd">
 <qhelp>
 <overview>
 <p>
 There are several built-in JavaScript functions that search for a regular expression match within a string,
 such as <code>RegExp.prototype.test</code> and <code>String.prototype.search</code>.
 If the regular expression is not anchored, the regular expression does not need to match the whole string;
 it only needs to match a substring.
 </p>
 <p>
 If the regular expression being searched for accepts the empty string, this means it can match an empty
 substring anywhere in the input string, and will thus always find a match.
 In this case, testing if a match exists is redundant and indicates dead code.
 </p>
 </overview>
 <recommendation>
 <p>
 Examine the regular expression and determine how it was intended to match:
 <ul>
 <li>To match the whole input string, add anchors at the beginning and end of the regular expression.</li>
 <li>To search for an occurrence within the input string, consider what the shortest meaningful match is and restrict the
 regular expression accordingly, such as by changing a <code>*</code> to a <code>+</code>.</li>
 </ul>
 </p>
 </recommendation>
 <example>
 <p>
 In the following example, a regular expression is used to check the format of an string <code>id</code>.
 However, the check always passes because the regular expression can match the empty substring.
 For example, it will allow the ID string "<code>%%</code>" by matching an empty string at index 0.
 </p>
 <sample src="examples/RegExpAlwaysMatches.js" />
 <p>
 To ensure the regular expression matches the whole string, add anchors at the beginning and end:
 </p>
 <sample src="examples/RegExpAlwaysMatchesGood.js" />
 </example>
 <references>
 <li>Mozilla Developer Network: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">JavaScript Regular Expressions</a>.</li>
 </references>
 </qhelp>
--- a/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
+++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
@@ -0,0 +1,117 @@
 /**
 * @name Regular expression always matches
 * @description Regular expression tests checks always find a match indicate dead code or a logic error
 * @kind problem
 * @problem.severity warning
 * @id js/regex/always-matches
 * @tags correctness
 *       regular-expressions
 * @precision high
 */
 import javascript
 /**
 * Gets a node reachable from the given root term through alts and groups only.
 *
 * For example, for `/(foo|bar)/` this gets `(foo|bar)`, `foo|bar`, `foo` and `bar`.
 */
 RegExpTerm getEffectiveRootAux(RegExpTerm actualRoot) {
  actualRoot.isRootTerm() and
  result = actualRoot
  or
  result = getEffectiveRootAux(actualRoot).(RegExpAlt).getAChild()
  or
  result = getEffectiveRootAux(actualRoot).(RegExpGroup).getAChild()
 }
 /**
 * Gets the effective root of the given term.
 *
 * For example, for `/(foo|bar)/` this gets `foo` and `bar`.
 */
 RegExpTerm getEffectiveRoot(RegExpTerm actualRoot) {
  result = getEffectiveRootAux(actualRoot) and
  not result instanceof RegExpAlt and
  not result instanceof RegExpGroup
 }
 /**
 * Holds if `term` contains an anchor on both ends.
 */
 predicate isPossiblyAnchoredOnBothEnds(RegExpSequence node) {
  node.getAChild*() instanceof RegExpCaret and
  node.getAChild*() instanceof RegExpDollar and
  node.getNumChild() >= 2
 }
 /**
 * Holds if `term` is obviously intended to match any string.
 */
 predicate isUniversalRegExp(RegExpTerm term) {
  exists(RegExpTerm child | child = term.(RegExpStar).getAChild() |
    child instanceof RegExpDot
    or
    child.(RegExpCharacterClass).isUniversalClass()
  )
 }
 /**
 * A call that searches for a regexp match within a string, but does not
 * extract the capture groups or the matched string itself.
 *
 * Because of the longest-match rule, queries that are more than pure tests
 * aren't necessarily broken just because the regexp can accept the empty string.
 */
 abstract class RegExpQuery extends DataFlow::CallNode {
  abstract RegExpTerm getRegExp();
 }
 /**
 * A call to `RegExp.prototype.test`.
 */
 class RegExpTestCall extends DataFlow::MethodCallNode, RegExpQuery {
  DataFlow::RegExpCreationNode regexp;
  RegExpTestCall() {
    this = regexp.getAReference().getAMethodCall("test")
  }
  override RegExpTerm getRegExp() {
    result = regexp.getRoot()
  }
 }
 /**
 * A call to `String.prototype.search`.
 */
 class RegExpSearchCall extends DataFlow::MethodCallNode, RegExpQuery {
  DataFlow::RegExpCreationNode regexp;
  RegExpSearchCall() {
    getMethodName() = "search" and
    regexp.getAReference().flowsTo(getArgument(0))
  }
  override RegExpTerm getRegExp() {
    result = regexp.getRoot()
  }
 }
 from RegExpTerm term, RegExpQuery call, string message
 where
  term.isNullable() and
  not term.getAChild() instanceof RegExpSubPattern and
  not isUniversalRegExp(term) and
  term = getEffectiveRoot(call.getRegExp()) and
  (
    call instanceof RegExpTestCall and
    not isPossiblyAnchoredOnBothEnds(term) and
    message = "This regular expression always matches when used in a test $@, as it can match an empty substring."
    or
    call instanceof RegExpSearchCall and
    not term.getAChild*() instanceof RegExpDollar and
    not term.getAChild*() instanceof RegExpSubPattern and
    message = "This regular expression always the matches at index 0 when used $@, as it matches the empty substring."
  )
 select term, message, call, "here"
--- a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js
+++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js
@@ -0,0 +1,3 @@
 if (!/[a-z0-9]*/.test(id)) {
    throw new Error("Invalid id: " + id);
 }
--- a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js
+++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js
@@ -0,0 +1,3 @@
 if (!/^[a-z0-9]*$/.test(id)) {
    throw new Error("Invalid id: " + id);
 }
--- a/javascript/ql/src/semmle/javascript/Regexp.qll
+++ b/javascript/ql/src/semmle/javascript/Regexp.qll
@@ -764,6 +764,23 @@ class RegExpCharacterClass extends RegExpTerm, @regexp_char_class {
  override string getAMatchedString() {
    not isInverted() and result = getAChild().getAMatchedString()
  }
  /**
   * Holds if this character class matches any character.
   */
  predicate isUniversalClass() {
    // [^]
    isInverted() and not exists(getAChild())
    or
    // [\w\W] and similar
    not isInverted() and
    exists(string cce1, string cce2 |
      cce1 = getAChild().(RegExpCharacterClassEscape).getValue() and
      cce2 = getAChild().(RegExpCharacterClassEscape).getValue()
    |
      cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
    )
  }
 }
 /**
--- a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected
+++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected
@@ -0,0 +1,5 @@
 | tst.js:2:11:2:20 | ^(https:)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:2:10:2:29 | /^(https:)?/.test(x) | here |
 | tst.js:14:11:14:19 | (\\.com)?$ | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:14:10:14:28 | /(\\.com)?$/.test(x) | here |
 | tst.js:22:11:22:34 | ^(?:https?:\|ftp:\|file:)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:22:10:22:43 | /^(?:ht ... test(x) | here |
 | tst.js:30:11:30:20 | (foo\|bar)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:30:10:30:29 | /(foo\|bar)?/.test(x) | here |
 | tst.js:34:21:34:26 | (baz)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:34:10:34:35 | /^foo\|b ... test(x) | here |
--- a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref
+++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref
@@ -0,0 +1 @@
 RegExp/RegExpAlwaysMatches.ql
--- a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js
+++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js
@@ -0,0 +1,67 @@
 function optionalPrefix(x) {
  return /^(https:)?/.test(x); // NOT OK
 }
 function mandatoryPrefix(x) {
  return /^https:/.test(x); // NOT OK
 }
 function httpOrHttps(x) {
  return /^https?:/.test(x); // OK
 }
 function optionalSuffix(x) {
  return /(\.com)?$/.test(x); // NOT OK
 }
 function mandatorySuffix(x) {
  return /\.com$/.test(x); // OK
 }
 function protocol(x) {
  return /^(?:https?:|ftp:|file:)?/.test(x); // NOT OK
 }
 function doubleAnchored(x) {
  return /^(foo|bar)?$/.test(x); // OK
 }
 function noAnchor(x) {
  return /(foo|bar)?/.test(x); // NOT OK
 }
 function altAnchor(x) {
  return /^foo|bar$|(baz)?/.test(x); // NOT OK
 }
 function wildcard(x) {
  return /.*/.test(x); // OK - obviously intended to match anything
 }
 function wildcard2(x) {
  return /[\d\D]*/.test(x); // OK - obviously intended to match anything
 }
 function emptyAlt(x) {
  return /^$|foo|bar/.test(x); // OK
 }
 function emptyAlt2(x) {
  return /(^$|foo|bar)/.test(x); // OK
 }
 function emptyAlt3(x) {
  return /((^$|foo|bar))/.test(x); // OK
 }
 function search(x) {
  return /[a-z]*/.search(x); // NOT OK
 }
 function search2(x) {
  return /[a-z]/.search(x); // OK
 }
 function lookahead(x) {
  return /(?!x)/.search(x); // OK
 }