Merge pull request #14481 from erik-krogh/proper-codepoints

ReDoS: use the new codePointAt and codePointCount methods instead of regex hacks
2026-04-22 07:15:15 +02:00 · 2023-10-13 09:35:55 +02:00
parent 2ddcd1d9cc fa1e8ee426
commit b1ad61e27d
3 changed files with 34 additions and 9 deletions
--- a/docs/codeql/ql-language-reference/ql-language-specification.rst
+++ b/docs/codeql/ql-language-reference/ql-language-specification.rst
@@ -1993,6 +1993,10 @@ The following built-in predicates are members of type ``int``:
 +-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
 | ``toUnicode``           | string      |                | The result is the unicode character for the receiver seen as a unicode code point.                             |
 +-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
+| ``codePointAt``         | int         | int            | The result is the unicode code point at the index given by the argument.                                       |
++-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+
+| ``codePointCount``      | int         | int, int       | The result is the number of unicode code points in the receiver between the given indices.                     |
++-------------------------+-------------+----------------+----------------------------------------------------------------------------------------------------------------+

 The leftmost bit after ``bitShiftRightSigned`` depends on sign extension, whereas after ``bitShiftRight`` it is zero.

--- a/shared/regex/codeql/regex/nfa/NfaUtils.qll
+++ b/shared/regex/codeql/regex/nfa/NfaUtils.qll
@@ -164,17 +164,17 @@ module Make<RegexTreeViewSig TreeImpl> {
    /** An input symbol corresponding to character `c`. */
    Char(string c) {
      c =
-        getCodepointAt(any(RegexpCharacterConstant cc |
+        getACodepoint(any(RegexpCharacterConstant cc |
            cc instanceof RelevantRegExpTerm and
            not isIgnoreCase(cc.getRootTerm())
-          ).getValue(), _)
+          ).getValue())
      or
      // normalize everything to lower case if the regexp is case insensitive
      c =
        any(RegexpCharacterConstant cc, string char |
          cc instanceof RelevantRegExpTerm and
          isIgnoreCase(cc.getRootTerm()) and
-          char = getCodepointAt(cc.getValue(), _)
+          char = getACodepoint(cc.getValue())
        |
          char.toLowerCase()
        )
@@ -370,7 +370,7 @@ module Make<RegexTreeViewSig TreeImpl> {
    string getARelevantChar() {
      exists(asciiPrintable(result))
      or
-      exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _))
+      exists(RegexpCharacterConstant c | result = getACodepoint(c.getValue()))
      or
      classEscapeMatches(_, result)
    }
@@ -1258,7 +1258,7 @@ module Make<RegexTreeViewSig TreeImpl> {
       * Gets a `char` that occurs in a `pump` string.
       */
      private string getAProcessChar() {
-        result = getCodepointAt(any(string s | isReDoSCandidate(_, s)), _)
+        result = getACodepoint(any(string s | isReDoSCandidate(_, s)))
      }
    }

--- a/shared/util/codeql/util/Strings.qll
+++ b/shared/util/codeql/util/Strings.qll
@@ -18,7 +18,11 @@ string escape(string s) {
 bindingset[s]
 private string escapeUnicodeString(string s) {
  result =
-    concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i)
+    concat(int i, string char |
+      char = escapeUnicodeChar(s.codePointAt(i).toUnicode())
+    |
+      char order by i
+    )
 }

 /**
@@ -44,15 +48,32 @@ private predicate isPrintable(string char) {

 /**
 * Gets the `i`th codepoint in `s`.
+ * Unpaired surrogates are skipped.
 */
 bindingset[s]
-string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) }
+string getCodepointAt(string s, int i) {
+  // codePointAt returns the integer codePoint, so we need to convert to a string.
+  // codePointAt returns an integer at both the high and the low index of a surrogate pair. The invalid values are filtered out by `toUnicode`, but the remaining characters then need to be re-indexed, which is why `rank` is used.
+  // rank is 1-indexed, so we need to offset for that to make this predicate 0-indexed.
+  result =
+    rank[i + 1](string char, int charIndex |
+      char = s.codePointAt(charIndex).toUnicode()
+    |
+      char order by charIndex
+    )
+}

 /**
- * Gets the length of `s` in codepoints.
+ * Gets any unicode character that appears in `s`.
+ */
+bindingset[s]
+string getACodepoint(string s) { result = s.codePointAt(_).toUnicode() }
+
+/**
+ * Gets the number of unicode codepoints in `s` not counting unpaired surrogates.
 */
 bindingset[str]
-int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() }
+int getCodepointLength(string str) { result = str.codePointCount(0, str.length()) }

 /**
 * Gets the ASCII code for `char`.