Merge pull request #2593 from asger-semmle/regexp-always-matches

JS: Add RegExpAlwaysMatches query
2026-04-28 18:25:24 +02:00 · 2020-01-08 15:21:39 +00:00
parent 8e700081f1 775e63d9c0
commit de15ecf47b
11 changed files with 303 additions and 19 deletions
--- a/javascript/ql/src/Performance/ReDoS.ql
+++ b/javascript/ql/src/Performance/ReDoS.ql
@@ -144,7 +144,7 @@ newtype TInputSymbol =
  CharClass(RegExpCharacterClass recc) {
    getRoot(recc).isRelevant() and
    not recc.isInverted() and
-    not isUniversalClass(recc)
+    not recc.isUniversalClass()
  } or
  /** An input symbol representing all characters matched by `.`. */
  Dot() or
@@ -153,23 +153,6 @@ newtype TInputSymbol =
  /** An epsilon transition in the automaton. */
  Epsilon()

-/**
- * Holds if character class `cc` matches all characters.
- */
-predicate isUniversalClass(RegExpCharacterClass cc) {
-  // [^]
-  cc.isInverted() and not exists(cc.getAChild())
-  or
-  // [\w\W] and similar
-  not cc.isInverted() and
-  exists(string cce1, string cce2 |
-    cce1 = cc.getAChild().(RegExpCharacterClassEscape).getValue() and
-    cce2 = cc.getAChild().(RegExpCharacterClassEscape).getValue()
-  |
-    cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
-  )
-}
-
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
@@ -361,7 +344,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
  )
  or
  exists(RegExpCharacterClass cc |
-    isUniversalClass(cc) and q1 = before(cc) and lbl = Any() and q2 = after(cc)
+    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and lbl = CharClass(cc) and q2 = after(cc)
  )
--- a/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp
+++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp
@@ -0,0 +1,55 @@
+<!DOCTYPE qhelp PUBLIC
+  "-//Semmle//qhelp//EN"
+  "qhelp.dtd">
+<qhelp>
+
+<overview>
+<p>
+There are several built-in JavaScript functions that search for a regular expression match within a string,
+such as <code>RegExp.prototype.test</code> and <code>String.prototype.search</code>.
+If the regular expression is not anchored, it only needs to match a substring of the input
+and won't necessarily match the whole string.
+</p>
+
+<p>
+If the regular expression being searched for accepts the empty string, this means it can match an empty
+substring anywhere in the input string, and will thus always find a match.
+In this case, testing if a match exists is redundant and indicates dead code.
+</p>
+
+</overview>
+<recommendation>
+
+<p>
+Examine the regular expression and determine how it was intended to match:
+</p>
+
+<ul>
+<li>To match the whole input string, add anchors at the beginning and end of the regular expression.</li>
+<li>To search for an occurrence within the input string, consider what the shortest meaningful match is and restrict the
+regular expression accordingly, such as by changing a <code>*</code> to a <code>+</code>.</li>
+</ul>
+
+</recommendation>
+<example>
+<p>
+In the following example, a regular expression is used to check the format of a string <code>id</code>.
+However, the check always passes because the regular expression can match the empty substring.
+For example, it will allow the ID string "<code>%%</code>" by matching an empty string at index 0.
+</p>
+
+<sample src="examples/RegExpAlwaysMatches.js" />
+
+<p>
+To ensure the regular expression matches the whole string, add anchors at the beginning and end:
+</p>
+
+<sample src="examples/RegExpAlwaysMatchesGood.js" />
+
+</example>
+<references>
+
+<li>Mozilla Developer Network: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">JavaScript Regular Expressions</a>.</li>
+
+</references>
+</qhelp>
--- a/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
+++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
@@ -0,0 +1,125 @@
+/**
+ * @name Regular expression always matches
+ * @description Regular expression tests that always find a match indicate dead code or a logic error
+ * @kind problem
+ * @problem.severity warning
+ * @id js/regex/always-matches
+ * @tags correctness
+ *       regular-expressions
+ * @precision high
+ */
+
+import javascript
+
+/**
+ * Gets a node reachable from the given root term through alts and groups only.
+ *
+ * For example, for `/(foo|bar)/` this gets `(foo|bar)`, `foo|bar`, `foo` and `bar`.
+ */
+RegExpTerm getEffectiveRootAux(RegExpTerm actualRoot) {
+  actualRoot.isRootTerm() and
+  result = actualRoot
+  or
+  result = getEffectiveRootAux(actualRoot).(RegExpAlt).getAChild()
+  or
+  result = getEffectiveRootAux(actualRoot).(RegExpGroup).getAChild()
+}
+
+/**
+ * Gets the effective root of the given term.
+ *
+ * For example, for `/(foo|bar)/` this gets `foo` and `bar`.
+ */
+RegExpTerm getEffectiveRoot(RegExpTerm actualRoot) {
+  result = getEffectiveRootAux(actualRoot) and
+  not result instanceof RegExpAlt and
+  not result instanceof RegExpGroup
+}
+
+/**
+ * Holds if `node` has at least two children and contains both a `^` and a `$` anchor somewhere within it.
+ */
+predicate isPossiblyAnchoredOnBothEnds(RegExpSequence node) {
+  node.getAChild*() instanceof RegExpCaret and
+  node.getAChild*() instanceof RegExpDollar and
+  node.getNumChild() >= 2
+}
+
+/**
+ * Holds if `term` is obviously intended to match any string.
+ */
+predicate isUniversalRegExp(RegExpTerm term) {
+  exists(RegExpTerm child | child = term.(RegExpStar).getAChild() |
+    child instanceof RegExpDot
+    or
+    child.(RegExpCharacterClass).isUniversalClass()
+  )
+}
+
+/**
+ * A call that searches for a regexp match within a string, but does not
+ * extract the capture groups or the matched string itself.
+ *
+ * Because of the longest-match rule, queries that are more than pure tests
+ * aren't necessarily broken just because the regexp can accept the empty string.
+ */
+abstract class RegExpQuery extends DataFlow::CallNode {
+  abstract RegExpTerm getRegExp();
+}
+
+/**
+ * A call to `RegExp.prototype.test`.
+ */
+class RegExpTestCall extends DataFlow::MethodCallNode, RegExpQuery {
+  DataFlow::RegExpCreationNode regexp;
+
+  RegExpTestCall() {
+    this = regexp.getAReference().getAMethodCall("test")
+  }
+
+  override RegExpTerm getRegExp() {
+    result = regexp.getRoot()
+  }
+}
+
+/**
+ * A call to `String.prototype.search`.
+ */
+class RegExpSearchCall extends DataFlow::MethodCallNode, RegExpQuery {
+  DataFlow::RegExpCreationNode regexp;
+
+  RegExpSearchCall() {
+    getMethodName() = "search" and
+    regexp.getAReference().flowsTo(getArgument(0))
+  }
+
+  override RegExpTerm getRegExp() {
+    result = regexp.getRoot()
+  }
+}
+
+/**
+ * Holds if `t` is a zero-width assertion other than an anchor.
+ */
+predicate isAssertion(RegExpTerm t) {
+  t instanceof RegExpSubPattern or
+  t instanceof RegExpWordBoundary or
+  t instanceof RegExpNonWordBoundary
+}
+
+from RegExpTerm term, RegExpQuery call, string message
+where
+  term.isNullable() and // the term can match the empty string
+  not isAssertion(term.getAChild*()) and // skip terms containing lookarounds or word-boundary assertions
+  not isUniversalRegExp(term) and // `.*` and `[^]*` style patterns are deliberately match-all
+  term = getEffectiveRoot(call.getRegExp()) and
+  (
+    call instanceof RegExpTestCall and
+    not isPossiblyAnchoredOnBothEnds(term) and
+    message = "This regular expression always matches when used in a test $@, as it can match an empty substring."
+    or
+    call instanceof RegExpSearchCall and
+    not term.getAChild*() instanceof RegExpDollar and
+    message = "This regular expression always matches at index 0 when used $@, as it matches the empty substring."
+  )
+select term, message, call, "here"
--- a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js
+++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js
@@ -0,0 +1,3 @@
+if (!/[a-z0-9]*/.test(id)) {
+    throw new Error("Invalid id: " + id);
+}
--- a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js
+++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js
@@ -0,0 +1,3 @@
+if (!/^[a-z0-9]*$/.test(id)) {
+    throw new Error("Invalid id: " + id);
+}
--- a/javascript/ql/src/semmle/javascript/Regexp.qll
+++ b/javascript/ql/src/semmle/javascript/Regexp.qll
@@ -764,6 +764,23 @@ class RegExpCharacterClass extends RegExpTerm, @regexp_char_class {
  override string getAMatchedString() {
    not isInverted() and result = getAChild().getAMatchedString()
  }
+
+  /**
+   * Holds if this character class matches any character.
+   */
+  predicate isUniversalClass() {
+    // [^]
+    isInverted() and not exists(getAChild())
+    or
+    // [\w\W] and similar
+    not isInverted() and
+    exists(string cce1, string cce2 |
+      cce1 = getAChild().(RegExpCharacterClassEscape).getValue() and
+      cce2 = getAChild().(RegExpCharacterClassEscape).getValue()
+    |
+      cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
+    )
+  }
 }

 /**