diff --git a/shared/regex/change-notes/2022-09-26-initial-version.md b/shared/regex/change-notes/2022-09-26-initial-version.md new file mode 100644 index 00000000000..e4d6e0490c2 --- /dev/null +++ b/shared/regex/change-notes/2022-09-26-initial-version.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Initial release. Extracted common regex related code, including the ReDoS analysis, into a library pack to share code between languages. diff --git a/shared/regex/codeql-pack.lock.yml b/shared/regex/codeql-pack.lock.yml new file mode 100644 index 00000000000..a046f6d9786 --- /dev/null +++ b/shared/regex/codeql-pack.lock.yml @@ -0,0 +1,4 @@ +--- +dependencies: {} +compiled: false +lockVersion: 1.0.0 \ No newline at end of file diff --git a/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll new file mode 100644 index 00000000000..8d3a0b9c0ff --- /dev/null +++ b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll @@ -0,0 +1,300 @@ +/** + * Classes and predicates for working with suspicious character ranges. + */ + +private import RegexTreeView + +/** + * Classes and predicates implementing an analysis detecting suspicious character ranges. + */ +module Make { + private import TreeImpl + + /** + * Gets a rank for `range` that is unique for ranges in the same file. + * Prioritizes ranges that match more characters. + */ + int rankRange(RegExpCharacterRange range) { + range = + rank[result](RegExpCharacterRange r, int startline, int startcolumn, int low, int high | + r.hasLocationInfo(_, startline, startcolumn, _, _) and + isRange(r, low, high) + | + r order by (high - low) desc, startline, startcolumn + ) + } + + /** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). 
*/ + predicate isRange(RegExpCharacterRange range, int low, int high) { + exists(string lowc, string highc | + range.isRange(lowc, highc) and + low.toUnicode() = lowc and + high.toUnicode() = highc + ) + } + + /** Holds if `char` is an alpha-numeric character. */ + predicate isAlphanumeric(string char) { + // written like this to avoid having a bindingset for the predicate + char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z + } + + /** + * Holds if the given ranges are from the same character class + * and there exists at least one character matched by both ranges. + */ + predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { + exists(RegExpCharacterClass clz | + a = clz.getAChild() and + b = clz.getAChild() and + a != b + | + exists(int alow, int ahigh, int blow, int bhigh | + isRange(a, alow, ahigh) and + isRange(b, blow, bhigh) and + alow <= bhigh and + blow <= ahigh + ) + ) + } + + /** + * Holds if `range` overlaps with the char class `escape` from the same character class. + */ + predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { + exists(RegExpCharacterClass clz, string low, string high | + range = clz.getAChild() and + escape = clz.getAChild() and + range.isRange(low, high) + | + escape.getValue() = "w" and + getInRange(low, high).regexpMatch("\\w") + or + escape.getValue() = "d" and + getInRange(low, high).regexpMatch("\\d") + or + escape.getValue() = "s" and + getInRange(low, high).regexpMatch("\\s") + ) + } + + /** Gets the unicode code point for a `char`. */ + bindingset[char] + int toCodePoint(string char) { result.toUnicode() = char } + + /** A character range that appears to be overly wide. 
*/ + class OverlyWideRange instanceof RegExpCharacterRange { + OverlyWideRange() { + exists(int low, int high, int numChars | + isRange(this, low, high) and + numChars = (1 + high - low) and + this.getRootTerm().isUsedAsRegExp() and + numChars >= 10 + | + // across the Z-a range (which includes backticks) + toCodePoint("Z") >= low and + toCodePoint("a") <= high + or + // across the 9-A range (which includes e.g. ; and ?) + toCodePoint("9") >= low and + toCodePoint("A") <= high + or + // a non-alphanumeric char as part of the range boundaries + exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) and + // while still being ascii + low < 128 and + high < 128 + ) and + // allowlist for known ranges + not this = allowedWideRanges() + } + + /** Gets a string representation of a character class that matches the same chars as this range. */ + string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } + + /** Gets a string representation of this range. */ + string toString() { result = super.toString() } + + /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */ + predicate isRange(string lo, string hi) { super.isRange(lo, hi) } + } + + /** Gets a range that should not be reported as an overly wide range. */ + RegExpCharacterRange allowedWideRanges() { + // ~ is the last printable ASCII character, so it's used as the upper bound in various wide ranges. + result.isRange(_, "~") + or + // the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character. + result.isRange([" ", "!"], _) + or + // the `[@-_]` range is intentional + result.isRange("@", "_") + or + // starting from the zero byte is a good indication that it's purposely matching a large range. + result.isRange(0.toUnicode(), _) + } + + /** Gets a char between (and including) `low` and `high`. 
*/ + bindingset[low, high] + private string getInRange(string low, string high) { + result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() + } + + /** A module computing an equivalent character class for an overly wide range. */ + module RangePrinter { + bindingset[char] + bindingset[result] + private string next(string char) { + exists(int prev, int next | + prev.toUnicode() = char and + next.toUnicode() = result and + next = prev + 1 + ) + } + + /** Gets the points where the parts of the pretty printed range should be cut off. */ + private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } + + /** Gets the char to use in the low end of a range for a given `cut` */ + private string lowCut(string cut) { + cut = ["A", "a", "0"] and + result = cut + or + cut = ["Z", "z", "9"] and + result = next(cut) + } + + /** Gets the char to use in the high end of a range for a given `cut` */ + private string highCut(string cut) { + cut = ["Z", "z", "9"] and + result = cut + or + cut = ["A", "a", "0"] and + next(result) = cut + } + + /** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */ + private string cutoff(OverlyWideRange range, int part) { + exists(int low, int high | isRange(range, low, high) | + result = + rank[part + 1](string cut | + cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high + | + cut order by toCodePoint(cut) + ) + ) + } + + /** Gets the number of parts we should print for a given `range`. */ + private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) } + + /** Holds if the given part of a range should span from `low` to `high`. */ + private predicate part(OverlyWideRange range, int part, string low, string high) { + // first part. 
+ part = 0 and + ( + range.isRange(low, high) and + parts(range) = 1 + or + parts(range) >= 2 and + range.isRange(low, _) and + high = highCut(cutoff(range, part)) + ) + or + // middle + part >= 1 and + part < parts(range) - 1 and + low = lowCut(cutoff(range, part - 1)) and + high = highCut(cutoff(range, part)) + or + // last. + part = parts(range) - 1 and + low = lowCut(cutoff(range, part - 1)) and + range.isRange(_, high) + } + + /** Gets an escaped `char` for use in a character class. */ + bindingset[char] + private string escape(string char) { + exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | + if char.regexpMatch(reg) then result = "\\" + char else result = char + ) + } + + /** Gets a part of the equivalent range. */ + private string printEquivalentCharClass(OverlyWideRange range, int part) { + exists(string low, string high | part(range, part, low, high) | + if + isAlphanumeric(low) and + isAlphanumeric(high) + then result = low + "-" + high + else + result = + strictconcat(string char | char = getInRange(low, high) | escape(char) order by char) + ) + } + + /** Gets the entire pretty printed equivalent range. */ + string printEquivalentCharClass(OverlyWideRange range) { + result = + strictconcat(string r, int part | + r = "[" and part = -1 and exists(range) + or + r = printEquivalentCharClass(range, part) + or + r = "]" and part = parts(range) + | + r order by part + ) + } + } + + /** Gets a char range that is overly large because of `reason`. */ + RegExpCharacterRange getABadRange(string reason, int priority) { + result instanceof OverlyWideRange and + priority = 0 and + exists(string equiv | equiv = result.(OverlyWideRange).printEquivalent() | + if equiv.length() <= 50 + then reason = "is equivalent to " + equiv + else reason = "is equivalent to " + equiv.substring(0, 50) + "..." 
+ ) + or + priority = 1 and + exists(RegExpCharacterRange other | + reason = "overlaps with " + other + " in the same character class" and + rankRange(result) < rankRange(other) and + overlap(result, other) + ) + or + priority = 2 and + exists(RegExpCharacterClassEscape escape | + reason = "overlaps with " + escape + " in the same character class" and + overlapsWithCharEscape(result, escape) + ) + or + reason = "is empty" and + priority = 3 and + exists(int low, int high | + isRange(result, low, high) and + low > high + ) + } + + /** Holds if `range` matches suspiciously many characters. */ + predicate problem(RegExpCharacterRange range, string reason) { + reason = + strictconcat(string m, int priority | + range = getABadRange(m, priority) + | + m, ", and " order by priority desc + ) and + // specifying a range using an escape is usually OK. + not range.getAChild() instanceof RegExpEscape and + // Unicode escapes in strings are interpreted before it turns into a regexp, + // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. + // We therefore exclude these ranges. + range.getRootTerm().getParent() instanceof RegExpLiteral and + // is used as regexp (mostly for JS where regular expressions are parsed eagerly) + range.getRootTerm().isUsedAsRegExp() + } +} diff --git a/shared/regex/codeql/regex/RegexTreeView.qll b/shared/regex/codeql/regex/RegexTreeView.qll new file mode 100644 index 00000000000..f805bd83185 --- /dev/null +++ b/shared/regex/codeql/regex/RegexTreeView.qll @@ -0,0 +1,451 @@ +/** + * This file contains a `RegexTreeViewSig` module describing the syntax tree of regular expressions. + */ + +/** + * A signature describing the syntax tree of regular expressions. + */ +signature module RegexTreeViewSig { + /** + * An element used in some way as or in a regular expression. + * This class exists to have a common supertype that all languages can agree on. 
+ */ + class Top; + + /** + * An element containing a regular expression term, that is, either + * a string literal (parsed as a regular expression; the root of the parse tree) + * or another regular expression term (a descendant of the root). + */ + class RegExpParent extends Top; + + /** + * A regular expression literal. + * + * Note that this class does not cover regular expressions constructed by calling the built-in + * `RegExp` function. + * + * Example: + * + * ``` + * /(?i)ab*c(d|e)$/ + * ``` + */ + class RegExpLiteral extends RegExpParent; + + /** + * A regular expression term, that is, a syntactic part of a regular expression. + * These are the tree nodes that form the parse tree of a regular expression literal. + */ + class RegExpTerm extends Top { + /** Gets a child term of this term. */ + RegExpTerm getAChild(); + + /** + * Holds if this is the root term of a regular expression. + */ + predicate isRootTerm(); + + /** + * Gets the parent term of this regular expression term, or the + * regular expression literal if this is the root term. + */ + RegExpParent getParent(); + + /** + * Holds if this term is part of a regular expression literal, or a string literal + * that is interpreted as a regular expression. + */ + predicate isUsedAsRegExp(); + + /** Gets the outermost term of this regular expression. */ + RegExpTerm getRootTerm(); + + /** Gets the raw source text of this term. */ + string getRawValue(); + + /** Gets the `i`th child term of this term. */ + RegExpTerm getChild(int i); + + /** Gets the number of child terms of this term. */ + int getNumChild(); + + /** Gets the regular expression term that is matched (textually) after this one, if any. */ + RegExpTerm getSuccessor(); + + string toString(); + + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ); + } + + /** + * A quantified regular expression term. 
+ * + * Example: + * + * ``` + * ((ECMA|Java)[sS]cript)* + * ``` + */ + class RegExpQuantifier extends RegExpTerm; + + /** + * A star-quantified term. + * + * Example: + * + * ``` + * \w* + * ``` + */ + class RegExpStar extends RegExpQuantifier; + + /** + * An optional term. + * + * Example: + * + * ``` + * ;? + * ``` + */ + class RegExpOpt extends RegExpQuantifier; + + /** + * A plus-quantified term. + * + * Example: + * + * ``` + * \w+ + * ``` + */ + class RegExpPlus extends RegExpQuantifier; + + /** + * A range-quantified term + * + * Examples: + * + * ``` + * \w{2,4} + * \w{2,} + * \w{2} + * ``` + */ + class RegExpRange extends RegExpQuantifier { + /** Gets the lower bound of the range. */ + int getLowerBound(); + + /** + * Gets the upper bound of the range, if any. + * + * If there is no upper bound, any number of repetitions is allowed. + * For a term of the form `r{lo}`, both the lower and the upper bound + * are `lo`. + */ + int getUpperBound(); + } + + /** + * An escaped regular expression term, that is, a regular expression + * term starting with a backslash. + * + * Example: + * + * ``` + * \. + * \w + * ``` + */ + class RegExpEscape extends RegExpTerm; + + /** + * A character class escape in a regular expression. + * + * Examples: + * + * ``` + * \w + * \S + * ``` + */ + class RegExpCharacterClassEscape extends RegExpEscape { + /** Gets the name of the character class; for example, `w` for `\w`. */ + string getValue(); + } + + /** + * An alternative term, that is, a term of the form `a|b`. + * + * Example: + * + * ``` + * ECMA|Java + * ``` + */ + class RegExpAlt extends RegExpTerm; + + /** + * A grouped regular expression. + * + * Examples: + * + * ``` + * (ECMA|Java) + * (?:ECMA|Java) + * (?['"]) + * ``` + */ + class RegExpGroup extends RegExpTerm { + /** + * Gets the index of this capture group within the enclosing regular + * expression literal. 
+ * + * For example, in the regular expression `/((a?).)(?:b)/`, the + * group `((a?).)` has index 1, the group `(a?)` nested inside it + * has index 2, and the group `(?:b)` has no index, since it is + * not a capture group. + */ + int getNumber(); + } + + /** + * A back reference, that is, a term of the form `\i` or `\k` + * in a regular expression. + * + * Examples: + * + * ``` + * \1 + * \k + * ``` + */ + class RegExpBackRef extends RegExpTerm { + /** Gets the capture group this back reference refers to. */ + RegExpGroup getGroup(); + } + + /** + * A sequence term. + * + * Example: + * + * ``` + * (ECMA|Java)Script + * ``` + * + * This is a sequence with the elements `(ECMA|Java)` and `Script`. + */ + class RegExpSequence extends RegExpTerm; + + /** + * A zero-width lookahead or lookbehind assertion. + * + * Examples: + * + * ``` + * (?=\w) + * (?!\n) + * (?<=\.) + * (?&] + * ``` + */ + class RegExpCharacterClass extends RegExpTerm { + /** + * Holds if this character class matches any character. + */ + predicate isUniversalClass(); + + /** Holds if this is an inverted character class, that is, a term of the form `[^...]`. */ + predicate isInverted(); + } + + /** + * A character range in a character class in a regular expression. + * + * Example: + * + * ``` + * a-z + * ``` + */ + class RegExpCharacterRange extends RegExpTerm { + /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */ + predicate isRange(string lo, string hi); + } + + /** + * A dot regular expression. + * + * Example: + * + * ``` + * . + * ``` + */ + class RegExpDot extends RegExpTerm; + + /** + * A dollar assertion `$` matching the end of a line. + * + * Example: + * + * ``` + * $ + * ``` + */ + class RegExpDollar extends RegExpTerm; + + /** + * A caret assertion `^` matching the beginning of a line. + * + * Example: + * + * ``` + * ^ + * ``` + */ + class RegExpCaret extends RegExpTerm; + + /** + * A word boundary assertion. 
+ * + * Example: + * + * ``` + * \b + * ``` + */ + class RegExpWordBoundary extends RegExpTerm; + + /** + * A regular expression term that permits unlimited repetitions. + */ + class InfiniteRepetitionQuantifier extends RegExpQuantifier; + + /** + * Holds if the regular expression should not be considered. + * + * For JavaScript we make the pragmatic performance optimization to ignore minified files. + */ + predicate isExcluded(RegExpParent parent); + + /** + * Holds if `term` is a possessive quantifier. + * As JavaScript's regexes do not support possessive quantifiers, this never holds, but is used by the shared library. + */ + predicate isPossessive(RegExpQuantifier term); + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. + * Not yet implemented for JavaScript. + */ + predicate matchesAnyPrefix(RegExpTerm term); + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. + * Not yet implemented for JavaScript. + */ + predicate matchesAnySuffix(RegExpTerm term); + + /** + * Holds if `term` is an escape class representing e.g. `\d`. + * `clazz` is which character class it represents, e.g. "d" for `\d`. + */ + predicate isEscapeClass(RegExpTerm term, string clazz); + + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root); + + /** + * Holds if `root` has the `s` (dot-all) flag, which lets `.` also match newline characters. + */ + predicate isDotAll(RegExpTerm root); +} diff --git a/shared/regex/codeql/regex/nfa/BadTagFilterQuery.qll b/shared/regex/codeql/regex/nfa/BadTagFilterQuery.qll new file mode 100644 index 00000000000..c9c254fe990 --- /dev/null +++ b/shared/regex/codeql/regex/nfa/BadTagFilterQuery.qll @@ -0,0 +1,177 @@ +/** + * Provides predicates for reasoning about bad tag filter vulnerabilities. 
+ */ + +private import NfaUtils as NfaUtils +private import RegexpMatching as RM +private import codeql.regex.RegexTreeView + +/** + * Module implementing classes and predicates reasoning about bad tag filter vulnerabilities. + */ +module Make { + private import TreeImpl + import RM::Make + + /** + * Holds if the regexp `root` should be tested against `str`. + * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`. + * `ignorePrefix` toggles whether the regular expression should be treated as accepting any prefix if it's unanchored. + * `testWithGroups` toggles whether it's tested which groups are filled by a given input string. + */ + private predicate isBadTagFilterCandidate( + RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups + ) { + // the regexp must mention "<" and ">" explicitly. + forall(string angleBracket | angleBracket = ["<", ">"] | + any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() = + root + ) and + ignorePrefix = true and + ( + str = ["", "", "", "", "", + "", "", "", "", + "", "", + "", "", "", + "", "", + "", "", + "") and + regexp.matches("") and + not regexp.matches("") and + ( + not regexp.matches("") and + msg = "This regular expression matches , but not " + or + not regexp.matches("") and + msg = "This regular expression matches , but not " + ) + or + regexp.matches("") and + regexp.matches("") and + not regexp.matches("") and + not regexp.matches("") and + msg = + "This regular expression does not match script tags where the attribute uses single-quotes." + or + regexp.matches("") and + regexp.matches("") and + not regexp.matches("") and + not regexp.matches("") and + msg = + "This regular expression does not match script tags where the attribute uses double-quotes." 
+ or + regexp.matches("") and + regexp.matches("") and + not regexp.matches("") and + not regexp.matches("") and + not regexp.matches("") and + msg = + "This regular expression does not match script tags where tabs are used between attributes." + or + regexp.matches("") and + not isIgnoreCase(regexp) and + not regexp.matches("") and + not regexp.matches("") and + ( + not regexp.matches("") and + msg = "This regular expression does not match upper case ") and + regexp.matches("") and + msg = "This regular expression does not match mixed case ") and + not regexp.matches("") and + not regexp.matches("") and + ( + not regexp.matches("") and + msg = "This regular expression does not match script end tags like ." + or + not regexp.matches("") and + msg = "This regular expression does not match script end tags like ." + or + not regexp.matches("