diff --git a/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected b/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected index bc017c35e80..5f89de5eeb6 100644 --- a/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected +++ b/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/OverlyLargeRangeQuery.expected @@ -8,3 +8,4 @@ | test.py:25:32:25:34 | 7-F | Suspicious character range that is equivalent to [7-9:;<=>?@A-F]. | | test.py:27:36:27:38 | 0-9 | Suspicious character range that overlaps with \\d in the same character class. | | test.py:29:39:29:41 | .-? | Suspicious character range that overlaps with \\w in the same character class, and is equivalent to [.\\/0-9:;<=>?]. | +| test.py:31:30:31:32 | \ufffd-\ufffd | Suspicious character range that overlaps with \\ufffd-\\ufffd in the same character class. | diff --git a/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py b/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py index 9897e0cc6ba..43380ccef0d 100644 --- a/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py +++ b/python/ql/test/query-tests/Security/CWE-020-SuspiciousRegexpRange/test.py @@ -27,3 +27,5 @@ numberToLetter = re.compile(r'[7-F]') # NOT OK overlapsWithClass1 = re.compile(r'[0-9\d]') # NOT OK overlapsWithClass2 = re.compile(r'[\w,.-?:*+]') # NOT OK + +unicodeStuff = re.compile('[\U0001D173-\U0001D17A\U000E0020-\U000E007F\U000e0001]') # NOT OK \ No newline at end of file diff --git a/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll index aad6e400583..c089bc1b9b7 100644 --- a/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll +++ b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll @@ -9,6 +9,7 @@ private import RegexTreeView */ module Make { private import TreeImpl + private import codeql.util.Strings as Strings /** * Gets a rank for `range` that is unique for ranges in the same file. @@ -213,7 +214,7 @@ module Make { bindingset[char] private string escape(string char) { exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | - if char.regexpMatch(reg) then result = "\\" + char else result = char + if char.regexpMatch(reg) then result = "\\" + char else result = Strings::escape(char) ) } @@ -257,14 +258,15 @@ module Make { or priority = 1 and exists(RegExpCharacterRange other | - reason = "overlaps with " + other + " in the same character class" and + reason = "overlaps with " + Strings::escape(other.toString()) + " in the same character class" and rankRange(result) < rankRange(other) and overlap(result, other) ) or priority = 2 and exists(RegExpCharacterClassEscape escape | - reason = "overlaps with " + escape + " in the same character class" and + reason = + "overlaps with " + escapeRegExpCharacterClassEscape(escape) + " in the same character class" and overlapsWithCharEscape(result, escape) ) or @@ -276,6 +278,13 @@ module Make { ) } + pragma[inline] + private string escapeRegExpCharacterClassEscape(RegExpCharacterClassEscape escape) { + if escape.toString().matches("%-%") + then result = Strings::escape(escape.toString()) // might contain unicode characters + else result = escape.toString() // just a plain `\d` or `\w` etc. Those are already escaped. + } + /** Holds if `range` matches suspiciously many characters. */ predicate problem(RegExpCharacterRange range, string reason) { reason = diff --git a/shared/regex/codeql/regex/nfa/NfaUtils.qll b/shared/regex/codeql/regex/nfa/NfaUtils.qll index c888b426d5e..4cdddd98e8f 100644 --- a/shared/regex/codeql/regex/nfa/NfaUtils.qll +++ b/shared/regex/codeql/regex/nfa/NfaUtils.qll @@ -4,6 +4,7 @@ private import codeql.regex.RegexTreeView private import codeql.util.Numbers +private import codeql.util.Strings /** * Classes and predicates that create an NFA and various algorithms for working with it. @@ -15,34 +16,7 @@ module Make { * Gets the char after `c` (from a simplified ASCII table). */ private string nextChar(string c) { - exists(int code | code = ascii(c) | code + 1 = ascii(result)) - } - - /** - * Gets the `i`th codepoint in `s`. - */ - bindingset[s] - private string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) } - - /** - * Gets the length of `s` in codepoints. - */ - bindingset[str] - private int getCodepointLength(string str) { - result = str.regexpReplaceAll("(.|\\s)", "x").length() - } - - /** - * Gets an approximation for the ASCII code for `char`. - * Only the easily printable chars are included (so no newline, tab, null, etc). - */ - private int ascii(string char) { - char = - rank[result](string c | - c = - "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" - .charAt(_) - ) + exists(int code | code = asciiPrintable(c) | code + 1 = asciiPrintable(result)) } /** @@ -394,7 +368,7 @@ module Make { * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`. */ string getARelevantChar() { - exists(ascii(result)) + exists(asciiPrintable(result)) or exists(RegexpCharacterConstant c | result = getCodepointAt(c.getValue(), _)) or @@ -1191,7 +1165,7 @@ module Make { private string relevant(RegExpRoot root) { root = relevantRoot() and ( - exists(ascii(result)) and exists(root) + exists(asciiPrintable(result)) and exists(root) or exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _)) or @@ -1322,49 +1296,6 @@ module Make { ) } - /** - * Gets the result of backslash-escaping newlines, carriage-returns and - * backslashes in `s`. - */ - bindingset[s] - private string escape(string s) { - result = - escapeUnicodeString(s.replaceAll("\\", "\\\\") - .replaceAll("\n", "\\n") - .replaceAll("\r", "\\r") - .replaceAll("\t", "\\t")) - } - - /** - * Gets a string where the unicode characters in `s` have been escaped. - */ - bindingset[s] - private string escapeUnicodeString(string s) { - result = - concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i) - } - - /** - * Gets a unicode escaped string for `char`. - * If `char` is a printable char, then `char` is returned. - */ - bindingset[char] - private string escapeUnicodeChar(string char) { - if isPrintable(char) - then result = char - else - if exists(to4digitHex(any(int i | i.toUnicode() = char))) - then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char)) - else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}" - } - - /** Holds if `char` is easily printable char, or whitespace. */ - private predicate isPrintable(string char) { - exists(ascii(char)) - or - char = "\n\r\t".charAt(_) - } - /** * Gets `str` with the last `i` characters moved to the front. * diff --git a/shared/util/codeql/util/Strings.qll b/shared/util/codeql/util/Strings.qll new file mode 100644 index 00000000000..c654b6209eb --- /dev/null +++ b/shared/util/codeql/util/Strings.qll @@ -0,0 +1,68 @@ +private import Numbers + +/** + * Gets the result of backslash-escaping newlines, carriage-returns, backslashes, and unicode characters in `s`. + */ +bindingset[s] +string escape(string s) { + result = + escapeUnicodeString(s.replaceAll("\\", "\\\\") + .replaceAll("\n", "\\n") + .replaceAll("\r", "\\r") + .replaceAll("\t", "\\t")) +} + +/** + * Gets a string where the unicode characters in `s` have been escaped. + */ +bindingset[s] +private string escapeUnicodeString(string s) { + result = + concat(int i, string char | char = escapeUnicodeChar(getCodepointAt(s, i)) | char order by i) +} + +/** + * Gets a unicode escaped string for `char`. + * If `char` is a printable char, then `char` is returned. + */ +bindingset[char] +private string escapeUnicodeChar(string char) { + if isPrintable(char) + then result = char + else + if exists(to4digitHex(any(int i | i.toUnicode() = char))) + then result = "\\u" + to4digitHex(any(int i | i.toUnicode() = char)) + else result = "\\u{" + toHex(any(int i | i.toUnicode() = char)) + "}" +} + +/** Holds if `char` is easily printable char, or whitespace. */ +private predicate isPrintable(string char) { + exists(asciiPrintable(char)) + or + char = "\n\r\t".charAt(_) +} + +/** + * Gets the `i`th codepoint in `s`. + */ +bindingset[s] +string getCodepointAt(string s, int i) { result = s.regexpFind("(.|\\s)", i, _) } + +/** + * Gets the length of `s` in codepoints. + */ +bindingset[str] +int getCodepointLength(string str) { result = str.regexpReplaceAll("(.|\\s)", "x").length() } + +/** + * Gets the ASCII code for `char`. + * Only the easily printable chars are included (so no newline, tab, null, etc). + */ +int asciiPrintable(string char) { + char = + rank[result](string c | + c = + "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" + .charAt(_) + ) +}