add further normalization of char classes

2026-02-11 04:31:05 +01:00 · 2022-02-14 18:49:57 +01:00
parent 3be4a86acd
commit 7fb3d81d2f
9 changed files with 415 additions and 252 deletions
--- a/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
+++ b/python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll
@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
 /**
 * Holds if `a` and `b` are input symbols from the same regexp.
 */
-private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
+private predicate sharesRoot(InputSymbol a, InputSymbol b) {
  exists(RegExpRoot root |
    belongsTo(a, root) and
    belongsTo(b, root)
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
 /**
 * Holds if the `a` is an input symbol from a regexp that has root `root`.
 */
-private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
+private predicate belongsTo(InputSymbol a, RegExpRoot root) {
  exists(State s | getRoot(s.getRepr()) = root |
    delta(s, a, _)
    or
@@ -378,6 +378,13 @@ private module CharacterClasses {
    )
  }

+  /** Gets `char`, lower-cased when the root term of `cc` has the ignore-case flag set. */
+  bindingset[char, cc]
+  private string caseNormalize(string char, RegExpTerm cc) {
+    RegExpFlags::isIgnoreCase(cc.getRootTerm()) and result = char.toLowerCase() or
+    not RegExpFlags::isIgnoreCase(cc.getRootTerm()) and result = char
+  }
+
  /**
   * An implementation of `CharacterClass` for positive (non inverted) character classes.
   */
@@ -386,7 +393,7 @@ private module CharacterClasses {

    PositiveCharacterClass() { this = getCanonicalCharClass(cc) and not cc.isInverted() }

-    override string getARelevantChar() { result = getAMentionedChar(cc) }
+    override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) }

    override predicate matches(string char) { hasChildThatMatches(cc, char) }
  }
@@ -400,8 +407,8 @@ private module CharacterClasses {
    InvertedCharacterClass() { this = getCanonicalCharClass(cc) and cc.isInverted() }

    override string getARelevantChar() {
-      result = nextChar(getAMentionedChar(cc)) or
-      nextChar(result) = getAMentionedChar(cc)
+      result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or
+      nextChar(result) = caseNormalize(getAMentionedChar(cc), cc)
    }

    bindingset[char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
   */
  private class PositiveCharacterClassEscape extends CharacterClass {
    string charClass;
+    RegExpTerm cc;

    PositiveCharacterClassEscape() {
-      exists(RegExpTerm cc |
-        isEscapeClass(cc, charClass) and
-        this = getCanonicalCharClass(cc) and
-        charClass = ["d", "s", "w"]
-      )
+      isEscapeClass(cc, charClass) and
+      this = getCanonicalCharClass(cc) and
+      charClass = ["d", "s", "w"]
    }

    override string getARelevantChar() {
@@ -445,7 +451,9 @@ private module CharacterClasses {
      result = " "
      or
      charClass = "w" and
-      result = ["a", "Z", "_", "0", "9"]
+      if RegExpFlags::isIgnoreCase(cc.getRootTerm())
+      then result = ["a", "z", "_", "0", "9"]
+      else result = ["a", "Z", "_", "0", "9"]
    }

    override predicate matches(string char) { classEscapeMatches(charClass, char) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
      not classEscapeMatches(charClass.toLowerCase(), char)
    }
  }
+
+  /** Gets the char class with the least raw value among those normalizing to the same string as `c`. */
+  CharacterClass normalize(CharacterClass c) {
+    exists(string normalization |
+      normalization = getNormalizationString(c) and
+      result =
+        min(CharacterClass cc, string raw |
+          getNormalizationString(cc) = normalization and cc = CharClass(raw)
+        |
+          cc order by raw
+        )
+    )
+  }
+
+  /** Gets a string that encodes which of the relevant chars are matched by `c`. */
+  private string getNormalizationString(CharacterClass c) {
+    (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and
+    result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar())
+    or
+    (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and
+    // The concat in the positive case never repeats a char, so it can never start with "nn".
+    // Prefixing this case with "nn:" therefore guarantees the two cases never collide.
+    // A negative char class can never match the same chars as a positive one,
+    // so keeping the two cases apart does not lose any results.
+    result =
+      "nn:" +
+        concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar())
+  }
 }

 private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
    cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
    or
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
+    lbl =
+      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
+            getCanonicalizationFlags(cc.getRootTerm()))) and
    q2 = after(cc)
  )
  or
  exists(RegExpTerm cc | isEscapeClass(cc, _) |
    q1 = before(cc) and
-    lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
+    lbl =
+      CharacterClasses::normalize(CharClass(cc.getRawValue() + "|" +
+            getCanonicalizationFlags(cc.getRootTerm()))) and
    q2 = after(cc)
  )
  or