diff --git a/change-notes/1.24/analysis-javascript.md b/change-notes/1.24/analysis-javascript.md
index 20dd26299e1..65076c235ff 100644
--- a/change-notes/1.24/analysis-javascript.md
+++ b/change-notes/1.24/analysis-javascript.md
@@ -17,6 +17,7 @@
| **Query** | **Tags** | **Purpose** |
|---------------------------------------------------------------------------------|-------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Cross-site scripting through exception (`js/xss-through-exception`) | security, external/cwe/cwe-079, external/cwe/cwe-116 | Highlights potential XSS vulnerabilities where an exception is written to the DOM. Results are not shown on LGTM by default. |
+| Regular expression always matches (`js/regex/always-matches`) | correctness, regular-expressions | Highlights regular expression checks that trivially succeed by matching an empty substring. |
## Changes to existing queries
diff --git a/javascript/config/suites/javascript/correctness-core b/javascript/config/suites/javascript/correctness-core
index d6b91e83a07..94ca3d8e054 100644
--- a/javascript/config/suites/javascript/correctness-core
+++ b/javascript/config/suites/javascript/correctness-core
@@ -34,6 +34,7 @@
+ semmlecode-javascript-queries/RegExp/DuplicateCharacterInCharacterClass.ql: /Correctness/Regular Expressions
+ semmlecode-javascript-queries/RegExp/EmptyCharacterClass.ql: /Correctness/Regular Expressions
+ semmlecode-javascript-queries/RegExp/IdentityReplacement.ql: /Correctness/Regular Expressions
++ semmlecode-javascript-queries/RegExp/RegExpAlwaysMatches.ql: /Correctness/Regular Expressions
+ semmlecode-javascript-queries/RegExp/UnboundBackref.ql: /Correctness/Regular Expressions
+ semmlecode-javascript-queries/RegExp/UnmatchableCaret.ql: /Correctness/Regular Expressions
+ semmlecode-javascript-queries/RegExp/UnmatchableDollar.ql: /Correctness/Regular Expressions
diff --git a/javascript/ql/src/Performance/ReDoS.ql b/javascript/ql/src/Performance/ReDoS.ql
index 2826306e5d2..7f74dcb47d5 100644
--- a/javascript/ql/src/Performance/ReDoS.ql
+++ b/javascript/ql/src/Performance/ReDoS.ql
@@ -144,7 +144,7 @@ newtype TInputSymbol =
CharClass(RegExpCharacterClass recc) {
getRoot(recc).isRelevant() and
not recc.isInverted() and
- not isUniversalClass(recc)
+ not recc.isUniversalClass()
} or
/** An input symbol representing all characters matched by `.`. */
Dot() or
@@ -153,23 +153,6 @@ newtype TInputSymbol =
/** An epsilon transition in the automaton. */
Epsilon()
-/**
- * Holds if character class `cc` matches all characters.
- */
-predicate isUniversalClass(RegExpCharacterClass cc) {
- // [^]
- cc.isInverted() and not exists(cc.getAChild())
- or
- // [\w\W] and similar
- not cc.isInverted() and
- exists(string cce1, string cce2 |
- cce1 = cc.getAChild().(RegExpCharacterClassEscape).getValue() and
- cce2 = cc.getAChild().(RegExpCharacterClassEscape).getValue()
- |
- cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
- )
-}
-
/**
* An abstract input symbol, representing a set of concrete characters.
*/
@@ -361,7 +344,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
)
or
exists(RegExpCharacterClass cc |
- isUniversalClass(cc) and q1 = before(cc) and lbl = Any() and q2 = after(cc)
+ cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
or
q1 = before(cc) and lbl = CharClass(cc) and q2 = after(cc)
)
diff --git a/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp b/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp
new file mode 100644
index 00000000000..cf31c172862
--- /dev/null
+++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.qhelp
@@ -0,0 +1,54 @@
+
+
+
+
+
+There are several built-in JavaScript functions that search for a regular expression match within a string,
+such as RegExp.prototype.test and String.prototype.search.
+If the regular expression is not anchored, the regular expression does not need to match the whole string;
+it only needs to match a substring.
+
+
+
+If the regular expression being searched for accepts the empty string, this means it can match an empty
+substring anywhere in the input string, and will thus always find a match.
+In this case, testing if a match exists is redundant and indicates dead code.
+
+
+
+
+
+
+Examine the regular expression and determine how it was intended to match:
+
+- To match the whole input string, add anchors at the beginning and end of the regular expression.
+- To search for an occurrence within the input string, consider what the shortest meaningful match is and restrict the
+regular expression accordingly, such as by changing a
* to a +.
+
+
+
+
+
+
+In the following example, a regular expression is used to check the format of an string id.
+However, the check always passes because the regular expression can match the empty substring.
+For example, it will allow the ID string "%%" by matching an empty string at index 0.
+
+
+
+
+
+To ensure the regular expression matches the whole string, add anchors at the beginning and end:
+
+
+
+
+
+
+
+Mozilla Developer Network: JavaScript Regular Expressions.
+
+
+
diff --git a/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql b/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
new file mode 100644
index 00000000000..b3eee6c6776
--- /dev/null
+++ b/javascript/ql/src/RegExp/RegExpAlwaysMatches.ql
@@ -0,0 +1,117 @@
+/**
+ * @name Regular expression always matches
+ * @description Regular expression tests checks always find a match indicate dead code or a logic error
+ * @kind problem
+ * @problem.severity warning
+ * @id js/regex/always-matches
+ * @tags correctness
+ * regular-expressions
+ * @precision high
+ */
+
+import javascript
+
+/**
+ * Gets a node reachable from the given root term through alts and groups only.
+ *
+ * For example, for `/(foo|bar)/` this gets `(foo|bar)`, `foo|bar`, `foo` and `bar`.
+ */
+RegExpTerm getEffectiveRootAux(RegExpTerm actualRoot) {
+ actualRoot.isRootTerm() and
+ result = actualRoot
+ or
+ result = getEffectiveRootAux(actualRoot).(RegExpAlt).getAChild()
+ or
+ result = getEffectiveRootAux(actualRoot).(RegExpGroup).getAChild()
+}
+
+/**
+ * Gets the effective root of the given term.
+ *
+ * For example, for `/(foo|bar)/` this gets `foo` and `bar`.
+ */
+RegExpTerm getEffectiveRoot(RegExpTerm actualRoot) {
+ result = getEffectiveRootAux(actualRoot) and
+ not result instanceof RegExpAlt and
+ not result instanceof RegExpGroup
+}
+
+/**
+ * Holds if `term` contains an anchor on both ends.
+ */
+predicate isPossiblyAnchoredOnBothEnds(RegExpSequence node) {
+ node.getAChild*() instanceof RegExpCaret and
+ node.getAChild*() instanceof RegExpDollar and
+ node.getNumChild() >= 2
+}
+
+/**
+ * Holds if `term` is obviously intended to match any string.
+ */
+predicate isUniversalRegExp(RegExpTerm term) {
+ exists(RegExpTerm child | child = term.(RegExpStar).getAChild() |
+ child instanceof RegExpDot
+ or
+ child.(RegExpCharacterClass).isUniversalClass()
+ )
+}
+
+/**
+ * A call that searches for a regexp match within a string, but does not
+ * extract the capture groups or the matched string itself.
+ *
+ * Because of the longest-match rule, queries that are more than pure tests
+ * aren't necessarily broken just because the regexp can accept the empty string.
+ */
+abstract class RegExpQuery extends DataFlow::CallNode {
+ abstract RegExpTerm getRegExp();
+}
+
+/**
+ * A call to `RegExp.prototype.test`.
+ */
+class RegExpTestCall extends DataFlow::MethodCallNode, RegExpQuery {
+ DataFlow::RegExpCreationNode regexp;
+
+ RegExpTestCall() {
+ this = regexp.getAReference().getAMethodCall("test")
+ }
+
+ override RegExpTerm getRegExp() {
+ result = regexp.getRoot()
+ }
+}
+
+/**
+ * A call to `String.prototype.search`.
+ */
+class RegExpSearchCall extends DataFlow::MethodCallNode, RegExpQuery {
+ DataFlow::RegExpCreationNode regexp;
+
+ RegExpSearchCall() {
+ getMethodName() = "search" and
+ regexp.getAReference().flowsTo(getArgument(0))
+ }
+
+ override RegExpTerm getRegExp() {
+ result = regexp.getRoot()
+ }
+}
+
+from RegExpTerm term, RegExpQuery call, string message
+where
+ term.isNullable() and
+ not term.getAChild() instanceof RegExpSubPattern and
+ not isUniversalRegExp(term) and
+ term = getEffectiveRoot(call.getRegExp()) and
+ (
+ call instanceof RegExpTestCall and
+ not isPossiblyAnchoredOnBothEnds(term) and
+ message = "This regular expression always matches when used in a test $@, as it can match an empty substring."
+ or
+ call instanceof RegExpSearchCall and
+ not term.getAChild*() instanceof RegExpDollar and
+ not term.getAChild*() instanceof RegExpSubPattern and
+ message = "This regular expression always the matches at index 0 when used $@, as it matches the empty substring."
+ )
+select term, message, call, "here"
diff --git a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js
new file mode 100644
index 00000000000..29e779dab79
--- /dev/null
+++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatches.js
@@ -0,0 +1,3 @@
+if (!/[a-z0-9]*/.test(id)) {
+ throw new Error("Invalid id: " + id);
+}
diff --git a/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js
new file mode 100644
index 00000000000..878f8082f78
--- /dev/null
+++ b/javascript/ql/src/RegExp/examples/RegExpAlwaysMatchesGood.js
@@ -0,0 +1,3 @@
+if (!/^[a-z0-9]*$/.test(id)) {
+ throw new Error("Invalid id: " + id);
+}
diff --git a/javascript/ql/src/semmle/javascript/Regexp.qll b/javascript/ql/src/semmle/javascript/Regexp.qll
index ae6c541b21c..29e5a1bae3e 100644
--- a/javascript/ql/src/semmle/javascript/Regexp.qll
+++ b/javascript/ql/src/semmle/javascript/Regexp.qll
@@ -764,6 +764,23 @@ class RegExpCharacterClass extends RegExpTerm, @regexp_char_class {
override string getAMatchedString() {
not isInverted() and result = getAChild().getAMatchedString()
}
+
+ /**
+ * Holds if this character class matches any character.
+ */
+ predicate isUniversalClass() {
+ // [^]
+ isInverted() and not exists(getAChild())
+ or
+ // [\w\W] and similar
+ not isInverted() and
+ exists(string cce1, string cce2 |
+ cce1 = getAChild().(RegExpCharacterClassEscape).getValue() and
+ cce2 = getAChild().(RegExpCharacterClassEscape).getValue()
+ |
+ cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
+ )
+ }
}
/**
diff --git a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected
new file mode 100644
index 00000000000..1a15b8702d3
--- /dev/null
+++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.expected
@@ -0,0 +1,5 @@
+| tst.js:2:11:2:20 | ^(https:)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:2:10:2:29 | /^(https:)?/.test(x) | here |
+| tst.js:14:11:14:19 | (\\.com)?$ | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:14:10:14:28 | /(\\.com)?$/.test(x) | here |
+| tst.js:22:11:22:34 | ^(?:https?:\|ftp:\|file:)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:22:10:22:43 | /^(?:ht ... test(x) | here |
+| tst.js:30:11:30:20 | (foo\|bar)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:30:10:30:29 | /(foo\|bar)?/.test(x) | here |
+| tst.js:34:21:34:26 | (baz)? | This regular expression always matches when used in a test $@, as it can match an empty substring. | tst.js:34:10:34:35 | /^foo\|b ... test(x) | here |
diff --git a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref
new file mode 100644
index 00000000000..acdde814bbc
--- /dev/null
+++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/RegExpAlwaysMatches.qlref
@@ -0,0 +1 @@
+RegExp/RegExpAlwaysMatches.ql
\ No newline at end of file
diff --git a/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js
new file mode 100644
index 00000000000..4f03b5ff70f
--- /dev/null
+++ b/javascript/ql/test/query-tests/RegExp/RegExpAlwaysMatches/tst.js
@@ -0,0 +1,67 @@
+function optionalPrefix(x) {
+ return /^(https:)?/.test(x); // NOT OK
+}
+
+function mandatoryPrefix(x) {
+ return /^https:/.test(x); // NOT OK
+}
+
+function httpOrHttps(x) {
+ return /^https?:/.test(x); // OK
+}
+
+function optionalSuffix(x) {
+ return /(\.com)?$/.test(x); // NOT OK
+}
+
+function mandatorySuffix(x) {
+ return /\.com$/.test(x); // OK
+}
+
+function protocol(x) {
+ return /^(?:https?:|ftp:|file:)?/.test(x); // NOT OK
+}
+
+function doubleAnchored(x) {
+ return /^(foo|bar)?$/.test(x); // OK
+}
+
+function noAnchor(x) {
+ return /(foo|bar)?/.test(x); // NOT OK
+}
+
+function altAnchor(x) {
+ return /^foo|bar$|(baz)?/.test(x); // NOT OK
+}
+
+function wildcard(x) {
+ return /.*/.test(x); // OK - obviously intended to match anything
+}
+
+function wildcard2(x) {
+ return /[\d\D]*/.test(x); // OK - obviously intended to match anything
+}
+
+function emptyAlt(x) {
+ return /^$|foo|bar/.test(x); // OK
+}
+
+function emptyAlt2(x) {
+ return /(^$|foo|bar)/.test(x); // OK
+}
+
+function emptyAlt3(x) {
+ return /((^$|foo|bar))/.test(x); // OK
+}
+
+function search(x) {
+ return /[a-z]*/.search(x); // NOT OK
+}
+
+function search2(x) {
+ return /[a-z]/.search(x); // OK
+}
+
+function lookahead(x) {
+ return /(?!x)/.search(x); // OK
+}