support case insensitive regexps in the ReDoS queries

2025-12-16 16:53:25 +01:00 · 2021-08-29 12:03:27 +02:00
parent bf15b18f22
commit f5a1a12435
9 changed files with 233 additions and 20 deletions
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -462,4 +462,4 @@
    "javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
    "python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
  ]
-}
+}
--- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {

 /**
 * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
+ * The string representation includes which flags are used with the regular expression.
 *
 * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
 * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
    min(RelevantRegExpTerm t, Location loc, File file |
      loc = t.getLocation() and
      file = t.getFile() and
-      str = t.getRawValue()
+      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
    |
      t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
    )
 }

+/**
+ * Gets a string reperesentation of the flags used with the regular expression.
+ * Only the flags that are relevant for the canonicalization are included.
+ */
+string getCanonicalizationFlags(RegExpTerm root) {
+  root.isRootTerm() and
+  (
+    RegExpFlags::isIgnoreCase(root) and
+    result = "i"
+    or
+    not RegExpFlags::isIgnoreCase(root) and
+    result = ""
+  )
+}
+
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 private newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) {
-    c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
+    c =
+      any(RegexpCharacterConstant cc |
+        cc instanceof RelevantRegExpTerm and
+        not RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      ).getValue().charAt(_)
+    or
+    // normalize to lower case if the regexp is case insensitive
+    c =
+      any(RegexpCharacterConstant cc, string char |
+        cc instanceof RelevantRegExpTerm and
+        RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+        char = cc.getValue().charAt(_)
+      |
+        char.toLowerCase()
+      )
  } or
  /**
   * An input symbol representing all characters matched by
   * a (non-universal) character class that has string representation `charClassString`.
   */
  CharClass(string charClassString) {
-    exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
    exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
      recc instanceof RegExpCharacterClass and
      not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
   */
  pragma[noinline]
  predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
+    if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+    then
+      // normalize everything to lower case if the regexp is case insensitive
+      exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
+    else hasChildThatMatchesIgnoringCasing(cc, char)
+  }
+
+  /**
+   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
+   * Ignores whether the character class is inside a regular expression that ignores casing.
+   */
+  pragma[noinline]
+  predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
    exists(getCanonicalCharClass(cc)) and
    exists(RegExpTerm child | child = cc.getAChild() |
      char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
 predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists(RegexpCharacterConstant s, int i |
    q1 = Match(s, i) and
-    lbl = Char(s.getValue().charAt(i)) and
+    (
+      not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      lbl = Char(s.getValue().charAt(i))
+      or
+      // normalizing to lower case if ignorecase flag is set
+      RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
+    ) and
    (
      q2 = Match(s, i + 1)
      or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
  )
  or
  exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
-    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
+    if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
  )
  or
  exists(RegExpCharacterClass cc |
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
    q2 = after(cc)
  )
  or
  exists(RegExpCharacterClassEscape cc |
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
    q2 = after(cc)
  )
  or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
  result = getRoot(term.getParent())
 }

-private newtype TState =
+/**
+ * A state in the NFA.
+ */
+newtype TState =
+  /**
+   * A state representing that the NFA is about to match a term.
+   * `i` is used to index into multi-char literals.
+   */
  Match(RelevantRegExpTerm t, int i) {
    i = 0
    or
    exists(t.(RegexpCharacterConstant).getValue().charAt(i))
  } or
+  /**
+   * An accept state, where exactly the given input string is accepted.
+   */
  Accept(RegExpRoot l) { l.isRelevant() } or
+  /**
+   * An accept state, where the given input string, or any string that has this
+   * string as a prefix, is accepted.
+   */
  AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }

 /**
--- a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
@@ -12,3 +12,41 @@ import javascript
 * For javascript we make the pragmatic performance optimization to ignore minified files.
 */
 predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() }
+
+/**
+ * A module containing predicates for determining which flags a regular expression have.
+ */
+module RegExpFlags {
+  /**
+   * Holds if `root` has the `i` flag for case-insensitive matching.
+   */
+  predicate isIgnoreCase(RegExpTerm root) {
+    root.isRootTerm() and
+    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
+      RegExp::isIgnoreCase(node.getFlags())
+    )
+  }
+
+  /**
+   * Gets the flags for `root`, or the empty string if `root` has no flags.
+   */
+  string getFlags(RegExpTerm root) {
+    root.isRootTerm() and
+    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
+      result = node.getFlags()
+      or
+      not exists(node.getFlags()) and
+      result = ""
+    )
+  }
+
+  /**
+   * Holds if `root` has the `s` flag for multi-line matching.
+   */
+  predicate isDotAll(RegExpTerm root) {
+    root.isRootTerm() and
+    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
+      RegExp::isDotAll(node.getFlags())
+    )
+  }
+}
--- a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected
+++ b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected
@@ -502,3 +502,7 @@
 | tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) |
 | tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) |
 | tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) |
+| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
+| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
+| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
+| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |
--- a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected
+++ b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected
@@ -178,3 +178,5 @@
 | tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. |
 | tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
 | tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
+| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. |
+| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |
--- a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js
+++ b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js
@@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/

 // GOOD - but we spuriously conclude that a rejecting suffix exists.
 var good44 = /([\s\S]*)+(?=$)/;
-var good45 = /([\s\S]*)+(?=$|y)/;
+var good45 = /([\s\S]*)+(?=$|y)/;
+
+var good46 = /(foo|FOO)*bar/;
+var bad91 = /(foo|FOO)*bar/i;
+
+var good47 = /([AB]|[ab])*C/;
+var bad92 = /([DE]|[de])*F/i;
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {

  predicate isDotAll() { re.getAMode() = "DOTALL" }

+  predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
+
+  string getFlags() {
+    not exists(re.getAMode()) and result = ""
+    or
+    result = strictconcat(string mode | mode = re.getAMode() | mode, " | ")
+  }
+
  override Regex getRegex() { result = re }

  string getPrimaryQLClass() { result = "RegExpLiteral" }
--- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
+++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {

 /**
 * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
+ * The string representation includes which flags are used with the regular expression.
 *
 * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
 * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
    min(RelevantRegExpTerm t, Location loc, File file |
      loc = t.getLocation() and
      file = t.getFile() and
-      str = t.getRawValue()
+      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
    |
      t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
    )
 }

+/**
+ * Gets a string reperesentation of the flags used with the regular expression.
+ * Only the flags that are relevant for the canonicalization are included.
+ */
+string getCanonicalizationFlags(RegExpTerm root) {
+  root.isRootTerm() and
+  (
+    RegExpFlags::isIgnoreCase(root) and
+    result = "i"
+    or
+    not RegExpFlags::isIgnoreCase(root) and
+    result = ""
+  )
+}
+
 /**
 * An abstract input symbol, representing a set of concrete characters.
 */
 private newtype TInputSymbol =
  /** An input symbol corresponding to character `c`. */
  Char(string c) {
-    c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
+    c =
+      any(RegexpCharacterConstant cc |
+        cc instanceof RelevantRegExpTerm and
+        not RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      ).getValue().charAt(_)
+    or
+    // normalize to lower case if the regexp is case insensitive
+    c =
+      any(RegexpCharacterConstant cc, string char |
+        cc instanceof RelevantRegExpTerm and
+        RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+        char = cc.getValue().charAt(_)
+      |
+        char.toLowerCase()
+      )
  } or
  /**
   * An input symbol representing all characters matched by
   * a (non-universal) character class that has string representation `charClassString`.
   */
  CharClass(string charClassString) {
-    exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
    exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
      recc instanceof RegExpCharacterClass and
      not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
   */
  pragma[noinline]
  predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
+    if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+    then
+      // normalize everything to lower case if the regexp is case insensitive
+      exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
+    else hasChildThatMatchesIgnoringCasing(cc, char)
+  }
+
+  /**
+   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
+   * Ignores whether the character class is inside a regular expression that ignores casing.
+   */
+  pragma[noinline]
+  predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
    exists(getCanonicalCharClass(cc)) and
    exists(RegExpTerm child | child = cc.getAChild() |
      char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
 * Gets a state the NFA may be in after matching `t`.
 */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
  exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
  or
  exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
 predicate delta(State q1, EdgeLabel lbl, State q2) {
  exists(RegexpCharacterConstant s, int i |
    q1 = Match(s, i) and
-    lbl = Char(s.getValue().charAt(i)) and
+    (
+      not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      lbl = Char(s.getValue().charAt(i))
+      or
+      // normalizing to lower case if ignorecase flag is set
+      RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
+    ) and
    (
      q2 = Match(s, i + 1)
      or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
  )
  or
  exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
-    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
+    if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
  )
  or
  exists(RegExpCharacterClass cc |
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
    q2 = after(cc)
  )
  or
  exists(RegExpCharacterClassEscape cc |
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
    q2 = after(cc)
  )
  or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
  result = getRoot(term.getParent())
 }

-private newtype TState =
+/**
+ * A state in the NFA.
+ */
+newtype TState =
+  /**
+   * A state representing that the NFA is about to match a term.
+   * `i` is used to index into multi-char literals.
+   */
  Match(RelevantRegExpTerm t, int i) {
    i = 0
    or
    exists(t.(RegexpCharacterConstant).getValue().charAt(i))
  } or
+  /**
+   * An accept state, where exactly the given input string is accepted.
+   */
  Accept(RegExpRoot l) { l.isRelevant() } or
+  /**
+   * An accept state, where the given input string, or any string that has this
+   * string as a prefix, is accepted.
+   */
  AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }

 /**
--- a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll
+++ b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll
@@ -18,3 +18,32 @@ predicate isExcluded(RegExpParent parent) {
  // we explicitly exclude these.
  count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
 }
+
+/**
+ * A module containing predicates for determining which flags a regular expression have.
+ */
+module RegExpFlags {
+  /**
+   * Holds if `root` has the `i` flag for case-insensitive matching.
+   */
+  predicate isIgnoreCase(RegExpTerm root) {
+    root.isRootTerm() and
+    root.getLiteral().isIgnoreCase()
+  }
+
+  /**
+   * Gets the flags for `root`, or the empty string if `root` has no flags.
+   */
+  string getFlags(RegExpTerm root) {
+    root.isRootTerm() and
+    result = root.getLiteral().getFlags()
+  }
+
+  /**
+   * Holds if `root` has the `s` flag for multi-line matching.
+   */
+  predicate isDotAll(RegExpTerm root) {
+    root.isRootTerm() and
+    root.getLiteral().isDotAll()
+  }
+}