From f5a1a124350e5363c4267cdd82f298b93a0d108f Mon Sep 17 00:00:00 2001
From: Erik Krogh Kristensen <erik-krogh@github.com>
Date: Sun, 29 Aug 2021 12:03:27 +0200
Subject: [PATCH] support case insensitive regexps in the ReDoS queries

---
 config/identical-files.json                   |  2 +-
 .../security/performance/ReDoSUtil.qll        | 81 ++++++++++++++++---
 .../security/performance/RegExpTreeView.qll   | 38 +++++++++
 .../ReDoS/PolynomialBackTracking.expected     |  4 +
 .../Performance/ReDoS/ReDoS.expected          |  2 +
 .../test/query-tests/Performance/ReDoS/tst.js |  8 +-
 python/ql/lib/semmle/python/RegexTreeView.qll |  8 ++
 .../python/security/performance/ReDoSUtil.qll | 81 ++++++++++++++++---
 .../security/performance/RegExpTreeView.qll   | 29 +++++++
 9 files changed, 233 insertions(+), 20 deletions(-)

diff --git a/config/identical-files.json b/config/identical-files.json
index 037b0126cb3..74ef7b82323 100644
--- a/config/identical-files.json
+++ b/config/identical-files.json
@@ -462,4 +462,4 @@
     "javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
     "python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
   ]
-}
+}
\ No newline at end of file
diff --git a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
index 43601b1d407..cdd42ce1540 100644
--- a/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
 
 /**
  * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
+ * The string representation includes which flags are used with the regular expression.
  *
  * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
  * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
     min(RelevantRegExpTerm t, Location loc, File file |
       loc = t.getLocation() and
       file = t.getFile() and
-      str = t.getRawValue()
+      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
     |
       t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
     )
 }
 
+/**
+ * Gets a string reperesentation of the flags used with the regular expression.
+ * Only the flags that are relevant for the canonicalization are included.
+ */
+string getCanonicalizationFlags(RegExpTerm root) {
+  root.isRootTerm() and
+  (
+    RegExpFlags::isIgnoreCase(root) and
+    result = "i"
+    or
+    not RegExpFlags::isIgnoreCase(root) and
+    result = ""
+  )
+}
+
 /**
  * An abstract input symbol, representing a set of concrete characters.
  */
 private newtype TInputSymbol =
   /** An input symbol corresponding to character `c`. */
   Char(string c) {
-    c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
+    c =
+      any(RegexpCharacterConstant cc |
+        cc instanceof RelevantRegExpTerm and
+        not RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      ).getValue().charAt(_)
+    or
+    // normalize to lower case if the regexp is case insensitive
+    c =
+      any(RegexpCharacterConstant cc, string char |
+        cc instanceof RelevantRegExpTerm and
+        RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+        char = cc.getValue().charAt(_)
+      |
+        char.toLowerCase()
+      )
   } or
   /**
    * An input symbol representing all characters matched by
    * a (non-universal) character class that has string representation `charClassString`.
    */
   CharClass(string charClassString) {
-    exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
     exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
       recc instanceof RegExpCharacterClass and
       not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
    */
   pragma[noinline]
   predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
+    if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+    then
+      // normalize everything to lower case if the regexp is case insensitive
+      exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
+    else hasChildThatMatchesIgnoringCasing(cc, char)
+  }
+
+  /**
+   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
+   * Ignores whether the character class is inside a regular expression that ignores casing.
+   */
+  pragma[noinline]
+  predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
     exists(getCanonicalCharClass(cc)) and
     exists(RegExpTerm child | child = cc.getAChild() |
       char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
  * Gets a state the NFA may be in after matching `t`.
  */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
   exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
   or
   exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
 predicate delta(State q1, EdgeLabel lbl, State q2) {
   exists(RegexpCharacterConstant s, int i |
     q1 = Match(s, i) and
-    lbl = Char(s.getValue().charAt(i)) and
+    (
+      not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      lbl = Char(s.getValue().charAt(i))
+      or
+      // normalizing to lower case if ignorecase flag is set
+      RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
+    ) and
     (
       q2 = Match(s, i + 1)
       or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
   )
   or
   exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
-    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
+    if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
   )
   or
   exists(RegExpCharacterClass cc |
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
     q2 = after(cc)
   )
   or
   exists(RegExpCharacterClassEscape cc |
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
     q2 = after(cc)
   )
   or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
   result = getRoot(term.getParent())
 }
 
-private newtype TState =
+/**
+ * A state in the NFA.
+ */
+newtype TState =
+  /**
+   * A state representing that the NFA is about to match a term.
+   * `i` is used to index into multi-char literals.
+   */
   Match(RelevantRegExpTerm t, int i) {
     i = 0
     or
     exists(t.(RegexpCharacterConstant).getValue().charAt(i))
   } or
+  /**
+   * An accept state, where exactly the given input string is accepted.
+   */
   Accept(RegExpRoot l) { l.isRelevant() } or
+  /**
+   * An accept state, where the given input string, or any string that has this
+   * string as a prefix, is accepted.
+   */
   AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
 
 /**
diff --git a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
index f730f62f5b8..f896e44f5e9 100644
--- a/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
+++ b/javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll
@@ -12,3 +12,41 @@ import javascript
  * For javascript we make the pragmatic performance optimization to ignore minified files.
  */
 predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() }
+
+/**
+ * A module containing predicates for determining which flags a regular expression have.
+ */
+module RegExpFlags {
+  /**
+   * Holds if `root` has the `i` flag for case-insensitive matching.
+   */
+  predicate isIgnoreCase(RegExpTerm root) {
+    root.isRootTerm() and
+    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
+      RegExp::isIgnoreCase(node.getFlags())
+    )
+  }
+
+  /**
+   * Gets the flags for `root`, or the empty string if `root` has no flags.
+   */
+  string getFlags(RegExpTerm root) {
+    root.isRootTerm() and
+    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
+      result = node.getFlags()
+      or
+      not exists(node.getFlags()) and
+      result = ""
+    )
+  }
+
+  /**
+   * Holds if `root` has the `s` flag for multi-line matching.
+   */
+  predicate isDotAll(RegExpTerm root) {
+    root.isRootTerm() and
+    exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
+      RegExp::isDotAll(node.getFlags())
+    )
+  }
+}
diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected
index 9dce8af29ad..e38b5f9326d 100644
--- a/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected
+++ b/javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected
@@ -502,3 +502,7 @@
 | tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) |
 | tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) |
 | tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) |
+| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
+| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
+| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
+| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |
diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected
index 05352f7a20d..fe23342eb02 100644
--- a/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected
+++ b/javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected
@@ -178,3 +178,5 @@
 | tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. |
 | tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
 | tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
+| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. |
+| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |
diff --git a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js
index 762929080ae..c3521343111 100644
--- a/javascript/ql/test/query-tests/Performance/ReDoS/tst.js
+++ b/javascript/ql/test/query-tests/Performance/ReDoS/tst.js
@@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/
 
 // GOOD - but we spuriously conclude that a rejecting suffix exists.
 var good44 = /([\s\S]*)+(?=$)/;
-var good45 = /([\s\S]*)+(?=$|y)/;
\ No newline at end of file
+var good45 = /([\s\S]*)+(?=$|y)/;
+
+var good46 = /(foo|FOO)*bar/;
+var bad91 = /(foo|FOO)*bar/i;
+
+var good47 = /([AB]|[ab])*C/;
+var bad92 = /([DE]|[de])*F/i;
diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll
index cc4c08c6364..784ea06ca29 100644
--- a/python/ql/lib/semmle/python/RegexTreeView.qll
+++ b/python/ql/lib/semmle/python/RegexTreeView.qll
@@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
 
   predicate isDotAll() { re.getAMode() = "DOTALL" }
 
+  predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
+
+  string getFlags() {
+    not exists(re.getAMode()) and result = ""
+    or
+    result = strictconcat(string mode | mode = re.getAMode() | mode, " | ")
+  }
+
   override Regex getRegex() { result = re }
 
   string getPrimaryQLClass() { result = "RegExpLiteral" }
diff --git a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
index 43601b1d407..cdd42ce1540 100644
--- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
+++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
 
 /**
  * Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
+ * The string representation includes which flags are used with the regular expression.
  *
  * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
  * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
     min(RelevantRegExpTerm t, Location loc, File file |
       loc = t.getLocation() and
       file = t.getFile() and
-      str = t.getRawValue()
+      str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
     |
       t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
     )
 }
 
+/**
+ * Gets a string reperesentation of the flags used with the regular expression.
+ * Only the flags that are relevant for the canonicalization are included.
+ */
+string getCanonicalizationFlags(RegExpTerm root) {
+  root.isRootTerm() and
+  (
+    RegExpFlags::isIgnoreCase(root) and
+    result = "i"
+    or
+    not RegExpFlags::isIgnoreCase(root) and
+    result = ""
+  )
+}
+
 /**
  * An abstract input symbol, representing a set of concrete characters.
  */
 private newtype TInputSymbol =
   /** An input symbol corresponding to character `c`. */
   Char(string c) {
-    c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
+    c =
+      any(RegexpCharacterConstant cc |
+        cc instanceof RelevantRegExpTerm and
+        not RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      ).getValue().charAt(_)
+    or
+    // normalize to lower case if the regexp is case insensitive
+    c =
+      any(RegexpCharacterConstant cc, string char |
+        cc instanceof RelevantRegExpTerm and
+        RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
+        char = cc.getValue().charAt(_)
+      |
+        char.toLowerCase()
+      )
   } or
   /**
    * An input symbol representing all characters matched by
    * a (non-universal) character class that has string representation `charClassString`.
    */
   CharClass(string charClassString) {
-    exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
     exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
       recc instanceof RegExpCharacterClass and
       not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
    */
   pragma[noinline]
   predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
+    if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+    then
+      // normalize everything to lower case if the regexp is case insensitive
+      exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
+    else hasChildThatMatchesIgnoringCasing(cc, char)
+  }
+
+  /**
+   * Holds if the character class `cc` has a child (constant or range) that matches `char`.
+   * Ignores whether the character class is inside a regular expression that ignores casing.
+   */
+  pragma[noinline]
+  predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
     exists(getCanonicalCharClass(cc)) and
     exists(RegExpTerm child | child = cc.getAChild() |
       char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
 /**
  * Gets a state the NFA may be in after matching `t`.
  */
-private State after(RegExpTerm t) {
+State after(RegExpTerm t) {
   exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
   or
   exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
 predicate delta(State q1, EdgeLabel lbl, State q2) {
   exists(RegexpCharacterConstant s, int i |
     q1 = Match(s, i) and
-    lbl = Char(s.getValue().charAt(i)) and
+    (
+      not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      lbl = Char(s.getValue().charAt(i))
+      or
+      // normalizing to lower case if ignorecase flag is set
+      RegExpFlags::isIgnoreCase(s.getRootTerm()) and
+      exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
+    ) and
     (
       q2 = Match(s, i + 1)
       or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
   )
   or
   exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
-    if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
+    if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
   )
   or
   exists(RegExpCharacterClass cc |
     cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
     or
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
     q2 = after(cc)
   )
   or
   exists(RegExpCharacterClassEscape cc |
     q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue()) and
+    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
     q2 = after(cc)
   )
   or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
   result = getRoot(term.getParent())
 }
 
-private newtype TState =
+/**
+ * A state in the NFA.
+ */
+newtype TState =
+  /**
+   * A state representing that the NFA is about to match a term.
+   * `i` is used to index into multi-char literals.
+   */
   Match(RelevantRegExpTerm t, int i) {
     i = 0
     or
     exists(t.(RegexpCharacterConstant).getValue().charAt(i))
   } or
+  /**
+   * An accept state, where exactly the given input string is accepted.
+   */
   Accept(RegExpRoot l) { l.isRelevant() } or
+  /**
+   * An accept state, where the given input string, or any string that has this
+   * string as a prefix, is accepted.
+   */
   AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
 
 /**
diff --git a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll
index 56a6c90ffde..4fbe32f7e7e 100644
--- a/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll
+++ b/python/ql/lib/semmle/python/security/performance/RegExpTreeView.qll
@@ -18,3 +18,32 @@ predicate isExcluded(RegExpParent parent) {
   // we explicitly exclude these.
   count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
 }
+
+/**
+ * A module containing predicates for determining which flags a regular expression have.
+ */
+module RegExpFlags {
+  /**
+   * Holds if `root` has the `i` flag for case-insensitive matching.
+   */
+  predicate isIgnoreCase(RegExpTerm root) {
+    root.isRootTerm() and
+    root.getLiteral().isIgnoreCase()
+  }
+
+  /**
+   * Gets the flags for `root`, or the empty string if `root` has no flags.
+   */
+  string getFlags(RegExpTerm root) {
+    root.isRootTerm() and
+    result = root.getLiteral().getFlags()
+  }
+
+  /**
+   * Holds if `root` has the `s` flag for multi-line matching.
+   */
+  predicate isDotAll(RegExpTerm root) {
+    root.isRootTerm() and
+    root.getLiteral().isDotAll()
+  }
+}