Merge pull request #14339 from erik-krogh/range-printing

JS/PY/RB/Java: escape unicode chars in overly-large-range
2025-12-24 04:36:35 +01:00 · 2023-10-09 14:22:38 +02:00
parent 57e32b47b7 5d4b542995
commit 625e889c62
5 changed files with 87 additions and 76 deletions
--- a/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected
+++ b/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected
@@ -8,3 +8,4 @@
 | test.py:25:32:25:34 | 7-F | Suspicious character range that is equivalent to [7-9:;<=>?@A-F]. |
 | test.py:27:36:27:38 | 0-9 | Suspicious character range that overlaps with \\d in the same character class. |
 | test.py:29:39:29:41 | .-? | Suspicious character range that overlaps with \\w in the same character class, and is equivalent to [.\\/0-9:;<=>?]. |
+| test.py:31:30:31:32 | \ufffd-\ufffd | Suspicious character range that overlaps with \\ufffd-\\ufffd in the same character class. |
--- a/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py
+++ b/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py
@@ -27,3 +27,5 @@ numberToLetter = re.compile(r'[7-F]') # NOT OK
 overlapsWithClass1 = re.compile(r'[0-9\d]') # NOT OK

 overlapsWithClass2 = re.compile(r'[\w,.-?:*+]') # NOT OK
+
+unicodeStuff =  re.compile('[\U0001D173-\U0001D17A\U000E0020-\U000E007F\U000e0001]') # NOT OK
--- a/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll
+++ b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll
@@ -9,6 +9,7 @@ private import RegexTreeView
 */
 module Make<RegexTreeViewSig TreeImpl> {
  private import TreeImpl
+  private import codeql.util.Strings as Strings

  /**
   * Gets a rank for `range` that is unique for ranges in the same file.
@@ -213,7 +214,7 @@ module Make<RegexTreeViewSig TreeImpl> {
    bindingset[char]
    private string escape(string char) {
      exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" |
-        if char.regexpMatch(reg) then result = "\\" + char else result = char
+        if char.regexpMatch(reg) then result = "\\" + char else result = Strings::escape(char)
      )
    }

@@ -257,14 +258,15 @@ module Make<RegexTreeViewSig TreeImpl> {
    or
    priority = 1 and
    exists(RegExpCharacterRange other |
-      reason = "overlaps with " + other + " in the same character class" and
+      reason = "overlaps with " + Strings::escape(other.toString()) + " in the same character class" and
      rankRange(result) < rankRange(other) and
      overlap(result, other)
    )
    or
    priority = 2 and
    exists(RegExpCharacterClassEscape escape |
-      reason = "overlaps with " + escape + " in the same character class" and
+      reason =
+        "overlaps with " + escapeRegExpCharacterClassEscape(escape) + " in the same character class" and
      overlapsWithCharEscape(result, escape)
    )
    or
@@ -276,6 +278,13 @@ module Make<RegexTreeViewSig TreeImpl> {
    )
  }

+  pragma[inline]
+  private string escapeRegExpCharacterClassEscape(RegExpCharacterClassEscape escape) {
+    if escape.toString().matches("%-%")
+    then result = Strings::escape(escape.toString()) // might contain unicode characters
+    else result = escape.toString() // just a plain `\d` or `\w` etc. Those are already escaped.
+  }
+
  /** Holds if `range` matches suspiciously many characters. */
  predicate problem(RegExpCharacterRange range, string reason) {
    reason =
--- a/shared/regex/codeql/regex/nfa/NfaUtils.qll
+++ b/shared/regex/codeql/regex/nfa/NfaUtils.qll
@@ -4,6 +4,7 @@

 private import codeql.regex.RegexTreeView
 private import codeql.util.Numbers
+private import codeql.util.Strings

 /**
 * Classes and predicates that create an NFA and various algorithms for working with it.
@@ -15,34 +16,7 @@ module Make<RegexTreeViewSig TreeImpl> {
   * Gets the char after `c` (from a simplified ASCII table).
   */
  private string nextChar(string c) {
-    exists(int code | code = ascii(c) | code + 1 = ascii(result))
-  }
-
-  /**
-   * Gets the `i`th codepoint in `s`.
-   */
-  bindingset[s]
-  private string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
-
-  /**
-   * Gets the length of `s` in codepoints.
-   */
-  bindingset[str]
-  private int getCodepointLength(string str) {
-    result = str.regexpReplaceAll("(.|\\s)", "x").length()
-  }
-
-  /**
-   * Gets an approximation for the ASCII code for `char`.
-   * Only the easily printable chars are included (so no newline, tab, null, etc).
-   */
-  private int ascii(string char) {
-    char =
-      rank[result](string c |
-        c =
-          "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
-              .charAt(_)
-      )
+    exists(int code | code = asciiPrintable(c) | code + 1 = asciiPrintable(result))
  }

  /**
@@ -394,7 +368,7 @@ module Make<RegexTreeViewSig TreeImpl> {
     * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`.
     */
    string getARelevantChar() {
-      exists(ascii(result))
+      exists(asciiPrintable(result))
      or
      exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
      or
@@ -1191,7 +1165,7 @@ module Make<RegexTreeViewSig TreeImpl> {
      private string relevant(RegExpRoot root) {
        root = relevantRoot() and
        (
-          exists(ascii(result)) and exists(root)
+          exists(asciiPrintable(result)) and exists(root)
          or
          exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _))
          or
@@ -1322,49 +1296,6 @@ module Make<RegexTreeViewSig TreeImpl> {
      )
    }

-    /**
-     * Gets the result of backslash-escaping newlines, carriage-returns and
-     * backslashes in `s`.
-     */
-    bindingset[s]
-    private string escape(string s) {
-      result =
-        escapeUnicodeString(s.replaceAll("\\", "\\\\")
-              .replaceAll("\n", "\\n")
-              .replaceAll("\r", "\\r")
-              .replaceAll("\t", "\\t"))
-    }
-
-    /**
-     * Gets a string where the unicode characters in `s` have been escaped.
-     */
-    bindingset[s]
-    private string escapeUnicodeString(string s) {
-      result =
-        concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
-    }
-
-    /**
-     * Gets a unicode escaped string for `char`.
-     * If `char` is a printable char, then `char` is returned.
-     */
-    bindingset[char]
-    private string escapeUnicodeChar(string char) {
-      if isPrintable(char)
-      then result = char
-      else
-        if exists(to4digitHex(any(int i | i.toUnicode() = char)))
-        then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
-        else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
-    }
-
-    /** Holds if `char` is easily printable char, or whitespace. */
-    private predicate isPrintable(string char) {
-      exists(ascii(char))
-      or
-      char = "\n\r\t".charAt(_)
-    }
-
    /**
     * Gets `str` with the last `i` characters moved to the front.
     *
--- a/shared/util/codeql/util/Strings.qll
+++ b/shared/util/codeql/util/Strings.qll
@@ -0,0 +1,68 @@
+private import Numbers
+
+/**
+ * Gets the result of backslash-escaping newlines, carriage-returns, backslashes, and unicode characters in `s`.
+ */
+bindingset[s]
+string escape(string s) {
+  result =
+    escapeUnicodeString(s.replaceAll("\\", "\\\\")
+          .replaceAll("\n", "\\n")
+          .replaceAll("\r", "\\r")
+          .replaceAll("\t", "\\t"))
+}
+
+/**
+ * Gets a string where the unicode characters in `s` have been escaped.
+ */
+bindingset[s]
+private string escapeUnicodeString(string s) {
+  result =
+    concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
+}
+
+/**
+ * Gets a unicode escaped string for `char`.
+ * If `char` is a printable char, then `char` is returned.
+ */
+bindingset[char]
+private string escapeUnicodeChar(string char) {
+  if isPrintable(char)
+  then result = char
+  else
+    if exists(to4digitHex(any(int i | i.toUnicode() = char)))
+    then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char))
+    else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}"
+}
+
+/** Holds if `char` is easily printable char, or whitespace. */
+private predicate isPrintable(string char) {
+  exists(asciiPrintable(char))
+  or
+  char = "\n\r\t".charAt(_)
+}
+
+/**
+ * Gets the `i`th codepoint in `s`.
+ */
+bindingset[s]
+string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
+
+/**
+ * Gets the length of `s` in codepoints.
+ */
+bindingset[str]
+int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
+
+/**
+ * Gets the ASCII code for `char`.
+ * Only the easily printable chars are included (so no newline, tab, null, etc).
+ */
+int asciiPrintable(string char) {
+  char =
+    rank[result](string c |
+      c =
+        "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
+            .charAt(_)
+    )
+}