['"]) - * ``` - */ -class RegExpGroup extends RegExpTerm, TRegExpGroup { - RegExpGroup() { this = TRegExpGroup(re, start, end) } - - /** - * Gets the index of this capture group within the enclosing regular - * expression literal. + * A star-quantified term. * - * For example, in the regular expression `/((a?).)(?:b)/`, the - * group `((a?).)` has index 1, the group `(a?)` nested inside it - * has index 2, and the group `(?:b)` has no index, since it is - * not a capture group. + * Example: + * + * ``` + * \w* + * ``` */ - int getNumber() { result = re.getGroupNumber(start, end) } + class RegExpStar extends InfiniteRepetitionQuantifier { + RegExpStar() { this.getQuantifier().charAt(0) = "*" } - /** Holds if this is a named capture group. */ - predicate isNamed() { exists(this.getName()) } + override string getPrimaryQLClass() { result = "RegExpStar" } + } - /** Gets the name of this capture group, if any. */ - string getName() { result = re.getGroupName(start, end) } + /** + * A plus-quantified term. + * + * Example: + * + * ``` + * \w+ + * ``` + */ + class RegExpPlus extends InfiniteRepetitionQuantifier { + RegExpPlus() { this.getQuantifier().charAt(0) = "+" } - override RegExpTerm getChild(int i) { - i = 0 and - exists(int in_start, int in_end | re.groupContents(start, end, in_start, in_end) | - result.occursInRegex(re, in_start, in_end) + override string getPrimaryQLClass() { result = "RegExpPlus" } + } + + /** + * An optional term. + * + * Example: + * + * ``` + * ;? + * ``` + */ + class RegExpOpt extends RegExpQuantifier { + RegExpOpt() { this.getQuantifier().charAt(0) = "?" 
} + + override string getPrimaryQLClass() { result = "RegExpOpt" } + } + + /** + * A range-quantified term + * + * Examples: + * + * ``` + * \w{2,4} + * \w{2,} + * \w{2} + * ``` + */ + class RegExpRange extends RegExpQuantifier { + string upper; + string lower; + + RegExpRange() { re.multiples(part_end, end, lower, upper) } + + /** Gets the string defining the upper bound of this range, which is empty when no such bound exists. */ + string getUpper() { result = upper } + + /** Gets the string defining the lower bound of this range, which is empty when no such bound exists. */ + string getLower() { result = lower } + + /** + * Gets the upper bound of the range, if any. + * + * If there is no upper bound, any number of repetitions is allowed. + * For a term of the form `r{lo}`, both the lower and the upper bound + * are `lo`. + */ + int getUpperBound() { result = this.getUpper().toInt() } + + /** Gets the lower bound of the range. */ + int getLowerBound() { result = this.getLower().toInt() } + + override string getPrimaryQLClass() { result = "RegExpRange" } + } + + /** + * A sequence term. + * + * Example: + * + * ``` + * (ECMA|Java)Script + * ``` + * + * This is a sequence with the elements `(ECMA|Java)` and `Script`. + */ + class RegExpSequence extends RegExpTerm, TRegExpSequence { + RegExpSequence() { this = TRegExpSequence(re, start, end) } + + override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) } + + /** Gets the element preceding `element` in this sequence. */ + RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) } + + /** Gets the element following `element` in this sequence. 
*/ + RegExpTerm nextElement(RegExpTerm element) { + exists(int i | + element = this.getChild(i) and + result = this.getChild(i + 1) + ) + } + + override string getPrimaryQLClass() { result = "RegExpSequence" } + } + + pragma[nomagic] + private int seqChildEnd(Regex re, int start, int end, int i) { + result = seqChild(re, start, end, i).getEnd() + } + + // moved out so we can use it in the charpred + private RegExpTerm seqChild(Regex re, int start, int end, int i) { + re.sequence(start, end) and + ( + i = 0 and + exists(int itemEnd | + re.item(start, itemEnd) and + result.occursInRegex(re, start, itemEnd) + ) + or + i > 0 and + exists(int itemStart, int itemEnd | itemStart = seqChildEnd(re, start, end, i - 1) | + re.item(itemStart, itemEnd) and + result.occursInRegex(re, itemStart, itemEnd) + ) ) } - override string getPrimaryQLClass() { result = "RegExpGroup" } + /** + * An alternative term, that is, a term of the form `a|b`. + * + * Example: + * + * ``` + * ECMA|Java + * ``` + */ + class RegExpAlt extends RegExpTerm, TRegExpAlt { + RegExpAlt() { this = TRegExpAlt(re, start, end) } - /** Holds if this is the `n`th numbered group of literal `lit`. */ - predicate isNumberedGroupOfLiteral(RegExpLiteral lit, int n) { - lit = this.getLiteral() and n = this.getNumber() - } + override RegExpTerm getChild(int i) { + i = 0 and + exists(int part_end | + re.alternationOption(start, end, start, part_end) and + result.occursInRegex(re, start, part_end) + ) + or + i > 0 and + exists(int part_start, int part_end | + part_start = this.getChild(i - 1).getEnd() + 1 // allow for the | + | + re.alternationOption(start, end, part_start, part_end) and + result.occursInRegex(re, part_start, part_end) + ) + } - /** Holds if this is a group with name `name` of literal `lit`. */ - predicate isNamedGroupOfLiteral(RegExpLiteral lit, string name) { - lit = this.getLiteral() and name = this.getName() - } -} - -/** - * A special character in a regular expression. 
- * - * Examples: - * ``` - * ^ - * $ - * . - * ``` - */ -class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { - string char; - - RegExpSpecialChar() { - this = TRegExpSpecialChar(re, start, end) and - re.specialCharacter(start, end, char) + override string getPrimaryQLClass() { result = "RegExpAlt" } } /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) + * An escaped regular expression term, that is, a regular expression + * term starting with a backslash, which is not a backreference. + * + * Example: + * + * ``` + * \. + * \w + * ``` */ - predicate isCharacter() { any() } + class RegExpEscape extends RegExpNormalChar { + RegExpEscape() { re.escapedCharacter(start, end) } - /** Gets the char for this term. */ - string getChar() { result = char } + /** + * Gets the name of the escaped; for example, `w` for `\w`. + * TODO: Handle named escapes. + */ + override string getValue() { + not this.isUnicode() and + this.isIdentityEscape() and + result = this.getUnescaped() + or + this.getUnescaped() = "n" and result = "\n" + or + this.getUnescaped() = "r" and result = "\r" + or + this.getUnescaped() = "t" and result = "\t" + or + this.getUnescaped() = "f" and result = 12.toUnicode() // form feed + or + this.getUnescaped() = "a" and result = 7.toUnicode() // alert/bell + or + this.getUnescaped() = "e" and result = 27.toUnicode() // escape (0x1B) + or + this.isUnicode() and + result = this.getUnicode() + } - override RegExpTerm getChild(int i) { none() } + /** Holds if this terms name is given by the part following the escape character. */ + predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f", "a", "e"] } - override string getPrimaryQLClass() { result = "RegExpSpecialChar" } -} + override string getPrimaryQLClass() { result = "RegExpEscape" } -/** - * A dot regular expression. - * - * Example: - * - * ``` - * . 
- * ``` - */ -class RegExpDot extends RegExpSpecialChar { - RegExpDot() { this.getChar() = "." } + /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */ + private string getUnescaped() { result = this.getText().suffix(1) } - override string getPrimaryQLClass() { result = "RegExpDot" } -} + /** + * Gets the text for this escape. That is e.g. "\w". + */ + private string getText() { result = re.getText().substring(start, end) } -/** - * A dollar assertion `$` matching the end of a line. - * - * Example: - * - * ``` - * $ - * ``` - */ -class RegExpDollar extends RegExpSpecialChar { - RegExpDollar() { this.getChar() = "$" } + /** + * Holds if this is a unicode escape. + */ + private predicate isUnicode() { this.getText().matches(["\\u%", "\\x%"]) } - override string getPrimaryQLClass() { result = "RegExpDollar" } -} + /** + * Gets the unicode char for this escape. + * E.g. for `\u0061` this returns "a". + */ + private string getUnicode() { + exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) | + result = codepoint.toUnicode() + ) + } -/** - * A caret assertion `^` matching the beginning of a line. - * - * Example: - * - * ``` - * ^ - * ``` - */ -class RegExpCaret extends RegExpSpecialChar { - RegExpCaret() { this.getChar() = "^" } + /** Gets the part of this escape that is a hexidecimal string */ + private string getHexString() { + this.isUnicode() and + if this.getText().matches("\\u%") // \uhhhh + then result = this.getText().suffix(2) + else + if this.getText().matches("\\x{%") // \x{h..h} + then result = this.getText().substring(3, this.getText().length() - 1) + else result = this.getText().suffix(2) // \xhh + } - override string getPrimaryQLClass() { result = "RegExpCaret" } -} - -/** - * A zero-width match, that is, either an empty group or an assertion. 
- * - * Examples: - * ``` - * () - * (?=\w) - * ``` - */ -class RegExpZeroWidthMatch extends RegExpGroup { - RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" } -} - -/** - * A zero-width lookahead or lookbehind assertion. - * - * Examples: - * - * ``` - * (?=\w) - * (?!\n) - * (?<=\.) - * (?` - * in a regular expression. - * - * Examples: - * - * ``` - * \1 - * (?P=quote) - * ``` - */ -class RegExpBackRef extends RegExpTerm, TRegExpBackRef { - RegExpBackRef() { this = TRegExpBackRef(re, start, end) } /** - * Gets the number of the capture group this back reference refers to, if any. + * A word boundary, that is, a regular expression term of the form `\b`. */ - int getNumber() { result = re.getBackrefNumber(start, end) } + class RegExpWordBoundary extends RegExpSpecialChar { + RegExpWordBoundary() { this.getChar() = "\\b" } + } /** - * Gets the name of the capture group this back reference refers to, if any. + * Gets the hex number for the `hex` char. */ - string getName() { result = re.getBackrefName(start, end) } - - /** Gets the capture group this back reference refers to. */ - RegExpGroup getGroup() { - result.isNumberedGroupOfLiteral(this.getLiteral(), this.getNumber()) + private int toHex(string hex) { + result = [0 .. 9] and hex = result.toString() or - result.isNamedGroupOfLiteral(this.getLiteral(), this.getName()) + result = 10 and hex = ["a", "A"] + or + result = 11 and hex = ["b", "B"] + or + result = 12 and hex = ["c", "C"] + or + result = 13 and hex = ["d", "D"] + or + result = 14 and hex = ["e", "E"] + or + result = 15 and hex = ["f", "F"] } - override RegExpTerm getChild(int i) { none() } + /** + * A character class escape in a regular expression. + * That is, an escaped character that denotes multiple characters. 
+ * + * Examples: + * + * ``` + * \w + * \S + * ``` + */ + class RegExpCharacterClassEscape extends RegExpEscape { + RegExpCharacterClassEscape() { + this.getValue() in ["d", "D", "s", "S", "w", "W", "h", "H", "v", "V"] or + this.getValue().charAt(0) in ["p", "P"] + } - override string getPrimaryQLClass() { result = "RegExpBackRef" } + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } + } + + /** + * A named character class in a regular expression. + * + * Examples: + * + * ``` + * \p{Digit} + * \p{IsLowerCase} + */ + class RegExpNamedProperty extends RegExpCharacterClassEscape { + boolean inverted; + string name; + + RegExpNamedProperty() { + name = this.getValue().substring(2, this.getValue().length() - 1) and + ( + inverted = false and + this.getValue().charAt(0) = "p" + or + inverted = true and + this.getValue().charAt(0) = "P" + ) + } + + /** Holds if this class is inverted. */ + predicate isInverted() { inverted = true } + + /** Gets the name of this class. */ + string getClassName() { result = name } + + /** + * Gets an equivalent single-chcracter escape sequence for this class (e.g. \d) if possible, excluding the escape character. + */ + string getBackslashEquivalent() { + exists(string eq | if inverted = true then result = eq.toUpperCase() else result = eq | + name = ["Digit", "IsDigit"] and + eq = "d" + or + name = ["Space", "IsWhite_Space"] and + eq = "s" + ) + } + } + + /** + * A character class in a regular expression. + * + * Examples: + * + * ``` + * [a-z_] + * [^<>&] + * ``` + */ + class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { + RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } + + /** Holds if this character class is inverted, matching the opposite of its content. */ + predicate isInverted() { re.getChar(start + 1) = "^" } + + /** Holds if this character class can match anything. 
*/ + predicate isUniversalClass() { + // [^] + this.isInverted() and not exists(this.getAChild()) + or + // [\w\W] and similar + not this.isInverted() and + exists(string cce1, string cce2 | + cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } + + override RegExpTerm getChild(int i) { + i = 0 and + exists(int itemStart, int itemEnd | + re.charSetStart(start, itemStart) and + re.charSetChild(start, itemStart, itemEnd) and + result.occursInRegex(re, itemStart, itemEnd) + ) + or + i > 0 and + exists(int itemStart, int itemEnd | itemStart = this.getChild(i - 1).getEnd() | + result.occursInRegex(re, itemStart, itemEnd) and + re.charSetChild(start, itemStart, itemEnd) + ) + } + + override string getPrimaryQLClass() { result = "RegExpCharacterClass" } + } + + /** + * A character range in a character class in a regular expression. + * + * Example: + * + * ``` + * a-z + * ``` + */ + class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { + int lower_end; + int upper_start; + + RegExpCharacterRange() { + this = TRegExpCharacterRange(re, start, end) and + re.charRange(_, start, lower_end, upper_start, end) + } + + /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */ + predicate isRange(string lo, string hi) { + lo = re.getText().substring(start, lower_end) and + hi = re.getText().substring(upper_start, end) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.occursInRegex(re, start, lower_end) + or + i = 1 and + result.occursInRegex(re, upper_start, end) + } + + override string getPrimaryQLClass() { result = "RegExpCharacterRange" } + } + + /** + * A normal character in a regular expression, that is, a character + * without special meaning. This includes escaped characters. + * It also includes escape sequences that represent character classes. 
+ * + * Examples: + * ``` + * t + * \t + * ``` + */ + class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { + RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string representation of the char matched by this term. */ + string getValue() { result = re.getText().substring(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpNormalChar" } + } + + /** + * A quoted sequence. + * + * Example: + * ``` + * \Qabc\E + * ``` + */ + class RegExpQuote extends RegExpTerm, TRegExpQuote { + string value; + + RegExpQuote() { + exists(int inner_start, int inner_end | + this = TRegExpQuote(re, start, end) and + re.quote(start, end, inner_start, inner_end) and + value = re.getText().substring(inner_start, inner_end) + ) + } + + /** Gets the string matched by this quote term. */ + string getValue() { result = value } + + override string getPrimaryQLClass() { result = "RegExpQuote" } + } + + /** + * A constant regular expression term, that is, a regular expression + * term matching a single string. This can be a single character or a quoted sequence. + * + * Example: + * + * ``` + * a + * ``` + */ + class RegExpConstant extends RegExpTerm { + string value; + + RegExpConstant() { + (value = this.(RegExpNormalChar).getValue() or value = this.(RegExpQuote).getValue()) and + not this instanceof RegExpCharacterClassEscape + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string matched by this constant term. 
*/ + string getValue() { result = value } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpConstant" } + } + + /** + * A grouped regular expression. + * + * Examples: + * + * ``` + * (ECMA|Java) + * (?:ECMA|Java) + * (?['"]) + * ``` + */ + class RegExpGroup extends RegExpTerm, TRegExpGroup { + RegExpGroup() { this = TRegExpGroup(re, start, end) } + + /** + * Gets the index of this capture group within the enclosing regular + * expression literal. + * + * For example, in the regular expression `/((a?).)(?:b)/`, the + * group `((a?).)` has index 1, the group `(a?)` nested inside it + * has index 2, and the group `(?:b)` has no index, since it is + * not a capture group. + */ + int getNumber() { result = re.getGroupNumber(start, end) } + + /** Holds if this is a named capture group. */ + predicate isNamed() { exists(this.getName()) } + + /** Gets the name of this capture group, if any. */ + string getName() { result = re.getGroupName(start, end) } + + override RegExpTerm getChild(int i) { + i = 0 and + exists(int in_start, int in_end | re.groupContents(start, end, in_start, in_end) | + result.occursInRegex(re, in_start, in_end) + ) + } + + override string getPrimaryQLClass() { result = "RegExpGroup" } + + /** Holds if this is the `n`th numbered group of literal `lit`. */ + predicate isNumberedGroupOfLiteral(RegExpLiteral lit, int n) { + lit = this.getLiteral() and n = this.getNumber() + } + + /** Holds if this is a group with name `name` of literal `lit`. */ + predicate isNamedGroupOfLiteral(RegExpLiteral lit, string name) { + lit = this.getLiteral() and name = this.getName() + } + } + + /** + * A special character in a regular expression. + * + * Examples: + * ``` + * ^ + * $ + * . 
+ * ``` + */ + class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { + string char; + + RegExpSpecialChar() { + this = TRegExpSpecialChar(re, start, end) and + re.specialCharacter(start, end, char) + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the char for this term. */ + string getChar() { result = char } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpSpecialChar" } + } + + /** + * A dot regular expression. + * + * Example: + * + * ``` + * . + * ``` + */ + class RegExpDot extends RegExpSpecialChar { + RegExpDot() { this.getChar() = "." } + + override string getPrimaryQLClass() { result = "RegExpDot" } + } + + /** + * A dollar assertion `$` matching the end of a line. + * + * Example: + * + * ``` + * $ + * ``` + */ + class RegExpDollar extends RegExpSpecialChar { + RegExpDollar() { this.getChar() = "$" } + + override string getPrimaryQLClass() { result = "RegExpDollar" } + } + + /** + * A caret assertion `^` matching the beginning of a line. + * + * Example: + * + * ``` + * ^ + * ``` + */ + class RegExpCaret extends RegExpSpecialChar { + RegExpCaret() { this.getChar() = "^" } + + override string getPrimaryQLClass() { result = "RegExpCaret" } + } + + /** + * A zero-width match, that is, either an empty group or an assertion. + * + * Examples: + * ``` + * () + * (?=\w) + * ``` + */ + class RegExpZeroWidthMatch extends RegExpGroup { + RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" } + } + + /** + * A zero-width lookahead or lookbehind assertion. + * + * Examples: + * + * ``` + * (?=\w) + * (?!\n) + * (?<=\.) + * (?` + * in a regular expression. 
+ * + * Examples: + * + * ``` + * \1 + * (?P=quote) + * ``` + */ + class RegExpBackRef extends RegExpTerm, TRegExpBackRef { + RegExpBackRef() { this = TRegExpBackRef(re, start, end) } + + /** + * Gets the number of the capture group this back reference refers to, if any. + */ + int getNumber() { result = re.getBackrefNumber(start, end) } + + /** + * Gets the name of the capture group this back reference refers to, if any. + */ + string getName() { result = re.getBackrefName(start, end) } + + /** Gets the capture group this back reference refers to. */ + RegExpGroup getGroup() { + result.isNumberedGroupOfLiteral(this.getLiteral(), this.getNumber()) + or + result.isNamedGroupOfLiteral(this.getLiteral(), this.getName()) + } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpBackRef" } + } + + class Top = RegExpParent; + + /** + * Holds if `term` is an escape class representing e.g. `\d`. + * `clazz` is which character class it represents, e.g. "d" for `\d`. + */ + predicate isEscapeClass(RegExpTerm term, string clazz) { + term.(RegExpCharacterClassEscape).getValue() = clazz + or + term.(RegExpNamedProperty).getBackslashEquivalent() = clazz + } + + /** + * Holds if `term` is a possessive quantifier, e.g. `a*+`. + */ + predicate isPossessive(RegExpQuantifier term) { term.isPossessive() } + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. + */ + predicate matchesAnyPrefix(RegExpTerm term) { not term.getRegex().matchesFullString() } + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. + */ + predicate matchesAnySuffix(RegExpTerm term) { not term.getRegex().matchesFullString() } + + /** + * Holds if the regular expression should not be considered. 
+ * + * We make the pragmatic performance optimization to ignore regular expressions in files + * that do not belong to the project code (such as installed dependencies). + */ + predicate isExcluded(RegExpParent parent) { + not exists(parent.getRegex().getLocation().getFile().getRelativePath()) + or + // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so + // we explicitly exclude these. + strictcount(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 + } + + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isIgnoreCase() + } + + /** + * Gets the flags for `root`, or the empty string if `root` has no flags. + */ + deprecated string getFlags(RegExpTerm root) { + root.isRootTerm() and + result = root.getLiteral().getFlags() + } + + /** + * Holds if `root` has the `s` flag for multi-line matching. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isDotAll() + } } - -/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */ -RegExpTerm getParsedRegExp(StringLiteral re) { result.getRegex() = re and result.isRootTerm() } diff --git a/java/ql/lib/semmle/code/java/security/OverlyLargeRangeQuery.qll b/java/ql/lib/semmle/code/java/security/OverlyLargeRangeQuery.qll index 65e662f0bc5..06b538d4a63 100644 --- a/java/ql/lib/semmle/code/java/security/OverlyLargeRangeQuery.qll +++ b/java/ql/lib/semmle/code/java/security/OverlyLargeRangeQuery.qll @@ -2,288 +2,7 @@ * Classes and predicates for working with suspicious character ranges. */ -// We don't need the NFA utils, just the regexp tree. -// but the below is a nice shared library that exposes the API we need. -import regexp.NfaUtils - -/** - * Gets a rank for `range` that is unique for ranges in the same file. - * Prioritizes ranges that match more characters. 
- */ -int rankRange(RegExpCharacterRange range) { - range = - rank[result](RegExpCharacterRange r, Location l, int low, int high | - r.getLocation() = l and - isRange(r, low, high) - | - r order by (high - low) desc, l.getStartLine(), l.getStartColumn() - ) -} - -/** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). */ -predicate isRange(RegExpCharacterRange range, int low, int high) { - exists(string lowc, string highc | - range.isRange(lowc, highc) and - low.toUnicode() = lowc and - high.toUnicode() = highc - ) -} - -/** Holds if `char` is an alpha-numeric character. */ -predicate isAlphanumeric(string char) { - // written like this to avoid having a bindingset for the predicate - char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z -} - -/** - * Holds if the given ranges are from the same character class - * and there exists at least one character matched by both ranges. - */ -predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { - exists(RegExpCharacterClass clz | - a = clz.getAChild() and - b = clz.getAChild() and - a != b - | - exists(int alow, int ahigh, int blow, int bhigh | - isRange(a, alow, ahigh) and - isRange(b, blow, bhigh) and - alow <= bhigh and - blow <= ahigh - ) - ) -} - -/** - * Holds if `range` overlaps with the char class `escape` from the same character class. - */ -predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { - exists(RegExpCharacterClass clz, string low, string high | - range = clz.getAChild() and - escape = clz.getAChild() and - range.isRange(low, high) - | - escape.getValue() = "w" and - getInRange(low, high).regexpMatch("\\w") - or - escape.getValue() = "d" and - getInRange(low, high).regexpMatch("\\d") - or - escape.getValue() = "s" and - getInRange(low, high).regexpMatch("\\s") - ) -} - -/** Gets the unicode code point for a `char`. 
*/ -bindingset[char] -int toCodePoint(string char) { result.toUnicode() = char } - -/** A character range that appears to be overly wide. */ -class OverlyWideRange extends RegExpCharacterRange { - OverlyWideRange() { - exists(int low, int high, int numChars | - isRange(this, low, high) and - numChars = (1 + high - low) and - this.getRootTerm().isUsedAsRegExp() and - numChars >= 10 - | - // across the Z-a range (which includes backticks) - toCodePoint("Z") >= low and - toCodePoint("a") <= high - or - // across the 9-A range (which includes e.g. ; and ?) - toCodePoint("9") >= low and - toCodePoint("A") <= high - or - // a non-alphanumeric char as part of the range boundaries - exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) and - // while still being ascii - low < 128 and - high < 128 - ) and - // allowlist for known ranges - not this = allowedWideRanges() - } - - /** Gets a string representation of a character class that matches the same chars as this range. */ - string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } -} - -/** Gets a range that should not be reported as an overly wide range. */ -RegExpCharacterRange allowedWideRanges() { - // ~ is the last printable ASCII character, it's used right in various wide ranges. - result.isRange(_, "~") - or - // the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character. - result.isRange([" ", "!"], _) - or - // the `[@-_]` range is intentional - result.isRange("@", "_") - or - // starting from the zero byte is a good indication that it's purposely matching a large range. - result.isRange(0.toUnicode(), _) -} - -/** Gets a char between (and including) `low` and `high`. */ -bindingset[low, high] -private string getInRange(string low, string high) { - result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() -} - -/** A module computing an equivalent character class for an overly wide range. 
*/ -module RangePrinter { - bindingset[char] - bindingset[result] - private string next(string char) { - exists(int prev, int next | - prev.toUnicode() = char and - next.toUnicode() = result and - next = prev + 1 - ) - } - - /** Gets the points where the parts of the pretty printed range should be cut off. */ - private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } - - /** Gets the char to use in the low end of a range for a given `cut` */ - private string lowCut(string cut) { - cut = ["A", "a", "0"] and - result = cut - or - cut = ["Z", "z", "9"] and - result = next(cut) - } - - /** Gets the char to use in the high end of a range for a given `cut` */ - private string highCut(string cut) { - cut = ["Z", "z", "9"] and - result = cut - or - cut = ["A", "a", "0"] and - next(result) = cut - } - - /** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */ - private string cutoff(OverlyWideRange range, int part) { - exists(int low, int high | isRange(range, low, high) | - result = - rank[part + 1](string cut | - cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high - | - cut order by toCodePoint(cut) - ) - ) - } - - /** Gets the number of parts we should print for a given `range`. */ - private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) } - - /** Holds if the given part of a range should span from `low` to `high`. */ - private predicate part(OverlyWideRange range, int part, string low, string high) { - // first part. - part = 0 and - ( - range.isRange(low, high) and - parts(range) = 1 - or - parts(range) >= 2 and - range.isRange(low, _) and - high = highCut(cutoff(range, part)) - ) - or - // middle - part >= 1 and - part < parts(range) - 1 and - low = lowCut(cutoff(range, part - 1)) and - high = highCut(cutoff(range, part)) - or - // last. 
- part = parts(range) - 1 and - low = lowCut(cutoff(range, part - 1)) and - range.isRange(_, high) - } - - /** Gets an escaped `char` for use in a character class. */ - bindingset[char] - private string escape(string char) { - exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | - if char.regexpMatch(reg) then result = "\\" + char else result = char - ) - } - - /** Gets a part of the equivalent range. */ - private string printEquivalentCharClass(OverlyWideRange range, int part) { - exists(string low, string high | part(range, part, low, high) | - if - isAlphanumeric(low) and - isAlphanumeric(high) - then result = low + "-" + high - else - result = - strictconcat(string char | char = getInRange(low, high) | escape(char) order by char) - ) - } - - /** Gets the entire pretty printed equivalent range. */ - string printEquivalentCharClass(OverlyWideRange range) { - result = - strictconcat(string r, int part | - r = "[" and part = -1 and exists(range) - or - r = printEquivalentCharClass(range, part) - or - r = "]" and part = parts(range) - | - r order by part - ) - } -} - -/** Gets a char range that is overly large because of `reason`. */ -RegExpCharacterRange getABadRange(string reason, int priority) { - result instanceof OverlyWideRange and - priority = 0 and - exists(string equiv | equiv = result.(OverlyWideRange).printEquivalent() | - if equiv.length() <= 50 - then reason = "is equivalent to " + equiv - else reason = "is equivalent to " + equiv.substring(0, 50) + "..." 
- ) - or - priority = 1 and - exists(RegExpCharacterRange other | - reason = "overlaps with " + other + " in the same character class" and - rankRange(result) < rankRange(other) and - overlap(result, other) - ) - or - priority = 2 and - exists(RegExpCharacterClassEscape escape | - reason = "overlaps with " + escape + " in the same character class" and - overlapsWithCharEscape(result, escape) - ) - or - reason = "is empty" and - priority = 3 and - exists(int low, int high | - isRange(result, low, high) and - low > high - ) -} - -/** Holds if `range` matches suspiciously many characters. */ -predicate problem(RegExpCharacterRange range, string reason) { - reason = - strictconcat(string m, int priority | - range = getABadRange(m, priority) - | - m, ", and " order by priority desc - ) and - // specifying a range using an escape is usually OK. - not range.getAChild() instanceof RegExpEscape and - // Unicode escapes in strings are interpreted before it turns into a regexp, - // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. - // We therefore exclude these ranges. - range.getRootTerm().getParent() instanceof RegExpLiteral and - // is used as regexp (mostly for JS where regular expressions are parsed eagerly) - range.getRootTerm().isUsedAsRegExp() -} +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +// OverlyLargeRangeQuery should be used directly from the shared pack, and not from this file. +deprecated import codeql.regex.OverlyLargeRangeQuery::Makeas Dep +import Dep diff --git a/java/ql/lib/semmle/code/java/security/regexp/ExponentialBackTracking.qll b/java/ql/lib/semmle/code/java/security/regexp/ExponentialBackTracking.qll index 4a608890249..d0a08dc88bf 100644 --- a/java/ql/lib/semmle/code/java/security/regexp/ExponentialBackTracking.qll +++ b/java/ql/lib/semmle/code/java/security/regexp/ExponentialBackTracking.qll @@ -62,284 +62,7 @@ * a suffix `x` (possible empty) that is most likely __not__ accepted. 
*/ -import NfaUtils - -/** - * Holds if state `s` might be inside a backtracking repetition. - */ -pragma[noinline] -private predicate stateInsideBacktracking(State s) { - s.getRepr().getParent*() instanceof MaybeBacktrackingRepetition -} - -/** - * A infinitely repeating quantifier that might backtrack. - */ -private class MaybeBacktrackingRepetition extends InfiniteRepetitionQuantifier { - MaybeBacktrackingRepetition() { - exists(RegExpTerm child | - child instanceof RegExpAlt or - child instanceof RegExpQuantifier - | - child.getParent+() = this - ) - } -} - -/** - * A state in the product automaton. - */ -private newtype TStatePair = - /** - * We lazily only construct those states that we are actually - * going to need: `(q, q)` for every fork state `q`, and any - * pair of states that can be reached from a pair that we have - * already constructed. To cut down on the number of states, - * we only represent states `(q1, q2)` where `q1` is lexicographically - * no bigger than `q2`. - * - * States are only constructed if both states in the pair are - * inside a repetition that might backtrack. - */ - MkStatePair(State q1, State q2) { - isFork(q1, _, _, _, _) and q2 = q1 - or - (step(_, _, _, q1, q2) or step(_, _, _, q2, q1)) and - rankState(q1) <= rankState(q2) - } - -/** - * Gets a unique number for a `state`. - * Is used to create an ordering of states, where states with the same `toString()` will be ordered differently. - */ -private int rankState(State state) { - state = - rank[result](State s, Location l | - stateInsideBacktracking(s) and - l = s.getRepr().getLocation() - | - s order by l.getStartLine(), l.getStartColumn(), s.toString() - ) -} - -/** - * A state in the product automaton. - */ -private class StatePair extends TStatePair { - State q1; - State q2; - - StatePair() { this = MkStatePair(q1, q2) } - - /** Gets a textual representation of this element. 
*/ - string toString() { result = "(" + q1 + ", " + q2 + ")" } - - /** Gets the first component of the state pair. */ - State getLeft() { result = q1 } - - /** Gets the second component of the state pair. */ - State getRight() { result = q2 } -} - -/** - * Holds for `(fork, fork)` state pairs when `isFork(fork, _, _, _, _)` holds. - * - * Used in `statePairDistToFork` - */ -private predicate isStatePairFork(StatePair p) { - exists(State fork | p = MkStatePair(fork, fork) and isFork(fork, _, _, _, _)) -} - -/** - * Holds if there are transitions from the components of `q` to the corresponding - * components of `r`. - * - * Used in `statePairDistToFork` - */ -private predicate reverseStep(StatePair r, StatePair q) { step(q, _, _, r) } - -/** - * Gets the minimum length of a path from `q` to `r` in the - * product automaton. - */ -private int statePairDistToFork(StatePair q, StatePair r) = - shortestDistances(isStatePairFork/1, reverseStep/2)(r, q, result) - -/** - * Holds if there are transitions from `q` to `r1` and from `q` to `r2` - * labelled with `s1` and `s2`, respectively, where `s1` and `s2` do not - * trivially have an empty intersection. - * - * This predicate only holds for states associated with regular expressions - * that have at least one repetition quantifier in them (otherwise the - * expression cannot be vulnerable to ReDoS attacks anyway). - */ -pragma[noopt] -private predicate isFork(State q, InputSymbol s1, InputSymbol s2, State r1, State r2) { - stateInsideBacktracking(q) and - exists(State q1, State q2 | - q1 = epsilonSucc*(q) and - delta(q1, s1, r1) and - q2 = epsilonSucc*(q) and - delta(q2, s2, r2) and - // Use pragma[noopt] to prevent intersect(s1,s2) from being the starting point of the join. - // From (s1,s2) it would find a huge number of intermediate state pairs (q1,q2) originating from different literals, - // and discover at the end that no `q` can reach both `q1` and `q2` by epsilon transitions. 
- exists(intersect(s1, s2)) - | - s1 != s2 - or - r1 != r2 - or - r1 = r2 and q1 != q2 - or - // If q can reach itself by epsilon transitions, then there are two distinct paths to the q1/q2 state: - // one that uses the loop and one that doesn't. The engine will separately attempt to match with each path, - // despite ending in the same state. The "fork" thus arises from the choice of whether to use the loop or not. - // To avoid every state in the loop becoming a fork state, - // we arbitrarily pick the InfiniteRepetitionQuantifier state as the canonical fork state for the loop - // (every epsilon-loop must contain such a state). - // - // We additionally require that the there exists another InfiniteRepetitionQuantifier `mid` on the path from `q` to itself. - // This is done to avoid flagging regular expressions such as `/(a?)*b/` - that only has polynomial runtime, and is detected by `js/polynomial-redos`. - // The below code is therefore a heuristic, that only flags regular expressions such as `/(a*)*b/`, - // and does not flag regular expressions such as `/(a?b?)c/`, but the latter pattern is not used frequently. - r1 = r2 and - q1 = q2 and - epsilonSucc+(q) = q and - exists(RegExpTerm term | term = q.getRepr() | term instanceof InfiniteRepetitionQuantifier) and - // One of the mid states is an infinite quantifier itself - exists(State mid, RegExpTerm term | - mid = epsilonSucc+(q) and - term = mid.getRepr() and - term instanceof InfiniteRepetitionQuantifier and - q = epsilonSucc+(mid) and - not mid = q - ) - ) and - stateInsideBacktracking(r1) and - stateInsideBacktracking(r2) -} - -/** - * Gets the state pair `(q1, q2)` or `(q2, q1)`; note that only - * one or the other is defined. - */ -private StatePair mkStatePair(State q1, State q2) { - result = MkStatePair(q1, q2) or result = MkStatePair(q2, q1) -} - -/** - * Holds if there are transitions from the components of `q` to the corresponding - * components of `r` labelled with `s1` and `s2`, respectively. 
- */ -private predicate step(StatePair q, InputSymbol s1, InputSymbol s2, StatePair r) { - exists(State r1, State r2 | step(q, s1, s2, r1, r2) and r = mkStatePair(r1, r2)) -} - -/** - * Holds if there are transitions from the components of `q` to `r1` and `r2` - * labelled with `s1` and `s2`, respectively. - * - * We only consider transitions where the resulting states `(r1, r2)` are both - * inside a repetition that might backtrack. - */ -pragma[noopt] -private predicate step(StatePair q, InputSymbol s1, InputSymbol s2, State r1, State r2) { - exists(State q1, State q2 | q.getLeft() = q1 and q.getRight() = q2 | - deltaClosed(q1, s1, r1) and - deltaClosed(q2, s2, r2) and - // use noopt to force the join on `intersect` to happen last. - exists(intersect(s1, s2)) - ) and - stateInsideBacktracking(r1) and - stateInsideBacktracking(r2) -} - -private newtype TTrace = - Nil() or - Step(InputSymbol s1, InputSymbol s2, TTrace t) { isReachableFromFork(_, _, s1, s2, t, _) } - -/** - * A list of pairs of input symbols that describe a path in the product automaton - * starting from some fork state. - */ -private class Trace extends TTrace { - /** Gets a textual representation of this element. */ - string toString() { - this = Nil() and result = "Nil()" - or - exists(InputSymbol s1, InputSymbol s2, Trace t | this = Step(s1, s2, t) | - result = "Step(" + s1 + ", " + s2 + ", " + t + ")" - ) - } -} - -/** - * Holds if `r` is reachable from `(fork, fork)` under input `w`, and there is - * a path from `r` back to `(fork, fork)` with `rem` steps. 
- */ -private predicate isReachableFromFork(State fork, StatePair r, Trace w, int rem) { - exists(InputSymbol s1, InputSymbol s2, Trace v | - isReachableFromFork(fork, r, s1, s2, v, rem) and - w = Step(s1, s2, v) - ) -} - -private predicate isReachableFromFork( - State fork, StatePair r, InputSymbol s1, InputSymbol s2, Trace v, int rem -) { - // base case - exists(State q1, State q2 | - isFork(fork, s1, s2, q1, q2) and - r = MkStatePair(q1, q2) and - v = Nil() and - rem = statePairDistToFork(r, MkStatePair(fork, fork)) - ) - or - // recursive case - exists(StatePair p | - isReachableFromFork(fork, p, v, rem + 1) and - step(p, s1, s2, r) and - rem = statePairDistToFork(r, MkStatePair(fork, fork)) - ) -} - -/** - * Gets a state in the product automaton from which `(fork, fork)` is - * reachable in zero or more epsilon transitions. - */ -private StatePair getAForkPair(State fork) { - isFork(fork, _, _, _, _) and - result = MkStatePair(epsilonPred*(fork), epsilonPred*(fork)) -} - -/** An implementation of a chain containing chars for use by `Concretizer`. */ -private module CharTreeImpl implements CharTree { - class CharNode = Trace; - - CharNode getPrev(CharNode t) { t = Step(_, _, result) } - - /** Holds if `n` is a trace that is used by `concretize` in `isPumpable`. */ - predicate isARelevantEnd(CharNode n) { - exists(State f | isReachableFromFork(f, getAForkPair(f), n, _)) - } - - string getChar(CharNode t) { - exists(InputSymbol s1, InputSymbol s2 | t = Step(s1, s2, _) | result = intersect(s1, s2)) - } -} - -/** - * Holds if `fork` is a pumpable fork with word `w`. 
- */ -private predicate isPumpable(State fork, string w) { - exists(StatePair q, Trace t | - isReachableFromFork(fork, q, t, _) and - q = getAForkPair(fork) and - w = Concretizer ::concretize(t) - ) -} - -/** Holds if `state` has exponential ReDoS */ -predicate hasReDoSResult = ReDoSPruning ::hasReDoSResult/4; +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +// ExponentialBackTracking should be used directly from the shared pack, and not from this file. +deprecated private import codeql.regex.nfa.ExponentialBackTracking::Make as Dep +import Dep diff --git a/java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll b/java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll index 5ff0cb6a39e..3b69ecc7120 100644 --- a/java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll +++ b/java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll @@ -7,1332 +7,7 @@ * other queries that benefit from reasoning about NFAs. */ -import NfaUtilsSpecific - -/** - * Gets the char after `c` (from a simplified ASCII table). - */ -private string nextChar(string c) { exists(int code | code = ascii(c) | code + 1 = ascii(result)) } - -/** - * Gets an approximation for the ASCII code for `char`. - * Only the easily printable chars are included (so no newline, tab, null, etc). - */ -private int ascii(string char) { - char = - rank[result](string c | - c = - "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" - .charAt(_) - ) -} - -/** - * Holds if `t` matches at least an epsilon symbol. - * - * That is, this term does not restrict the language of the enclosing regular expression. - * - * This is implemented as an under-approximation, and this predicate does not hold for sub-patterns in particular. 
- */ -predicate matchesEpsilon(RegExpTerm t) { - t instanceof RegExpStar - or - t instanceof RegExpOpt - or - t.(RegExpRange).getLowerBound() = 0 - or - exists(RegExpTerm child | - child = t.getAChild() and - matchesEpsilon(child) - | - t instanceof RegExpAlt or - t instanceof RegExpGroup or - t instanceof RegExpPlus or - t instanceof RegExpRange - ) - or - matchesEpsilon(t.(RegExpBackRef).getGroup()) - or - forex(RegExpTerm child | child = t.(RegExpSequence).getAChild() | matchesEpsilon(child)) -} - -/** - * A lookahead/lookbehind that matches the empty string. - */ -class EmptyPositiveSubPattern extends RegExpSubPattern { - EmptyPositiveSubPattern() { - ( - this instanceof RegExpPositiveLookahead - or - this instanceof RegExpPositiveLookbehind - ) and - matchesEpsilon(this.getOperand()) - } -} - -/** DEPRECATED: Use `EmptyPositiveSubPattern` instead. */ -deprecated class EmptyPositiveSubPatttern = EmptyPositiveSubPattern; - -/** - * A branch in a disjunction that is the root node in a literal, or a literal - * whose root node is not a disjunction. - */ -class RegExpRoot extends RegExpTerm { - RegExpRoot() { - exists(RegExpParent parent | - exists(RegExpAlt alt | - alt.isRootTerm() and - this = alt.getAChild() and - parent = alt.getParent() - ) - or - this.isRootTerm() and - not this instanceof RegExpAlt and - parent = this.getParent() - ) - } - - /** - * Holds if this root term is relevant to the ReDoS analysis. - */ - predicate isRelevant() { - // is actually used as a RegExp - this.isUsedAsRegExp() and - // not excluded for library specific reasons - not isExcluded(this.getRootTerm().getParent()) - } -} - -/** - * A constant in a regular expression that represents valid Unicode character(s). - */ -private class RegexpCharacterConstant extends RegExpConstant { - RegexpCharacterConstant() { this.isCharacter() } -} - -/** - * A regexp term that is relevant for this ReDoS analysis. 
- */ -class RelevantRegExpTerm extends RegExpTerm { - RelevantRegExpTerm() { getRoot(this).isRelevant() } -} - -/** - * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. - * The string representation includes which flags are used with the regular expression. - * - * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. - * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. - */ -private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { - term = - min(RelevantRegExpTerm t, Location loc, File file | - loc = t.getLocation() and - file = t.getFile() and - str = getCanonicalizationString(t) - | - t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn() - ) -} - -/** - * Gets a string representation of `term` that is used for canonicalization. - */ -private string getCanonicalizationString(RelevantRegExpTerm term) { - exists(string ignoreCase | - (if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and - result = term.getRawValue() + "|" + ignoreCase - ) -} - -/** - * An abstract input symbol, representing a set of concrete characters. - */ -private newtype TInputSymbol = - /** An input symbol corresponding to character `c`. 
*/ - Char(string c) { - c = - any(RegexpCharacterConstant cc | - cc instanceof RelevantRegExpTerm and - not RegExpFlags::isIgnoreCase(cc.getRootTerm()) - ).getValue().charAt(_) - or - // normalize everything to lower case if the regexp is case insensitive - c = - any(RegexpCharacterConstant cc, string char | - cc instanceof RelevantRegExpTerm and - RegExpFlags::isIgnoreCase(cc.getRootTerm()) and - char = cc.getValue().charAt(_) - | - char.toLowerCase() - ) - } or - /** - * An input symbol representing all characters matched by - * a (non-universal) character class that has string representation `charClassString`. - */ - CharClass(string charClassString) { - exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | - recc instanceof RegExpCharacterClass and - not recc.(RegExpCharacterClass).isUniversalClass() - or - isEscapeClass(recc, _) - ) - } or - /** An input symbol representing all characters matched by `.`. */ - Dot() or - /** An input symbol representing all characters. */ - Any() or - /** An epsilon transition in the automaton. */ - Epsilon() - -/** - * Gets the the CharClass corresponding to the canonical representative `term`. - */ -private CharClass getCharClassForCanonicalTerm(RegExpTerm term) { - exists(string str | isCanonicalTerm(term, str) | result = CharClass(str)) -} - -/** - * Gets a char class that represents `term`, even when `term` is not the canonical representative. - */ -CharacterClass getCanonicalCharClass(RegExpTerm term) { - exists(string str | str = getCanonicalizationString(term) and result = CharClass(str)) -} - -/** - * Holds if `a` and `b` are input symbols from the same regexp. - */ -private predicate sharesRoot(InputSymbol a, InputSymbol b) { - exists(RegExpRoot root | - belongsTo(a, root) and - belongsTo(b, root) - ) -} - -/** - * Holds if the `a` is an input symbol from a regexp that has root `root`. 
- */ -private predicate belongsTo(InputSymbol a, RegExpRoot root) { - exists(State s | getRoot(s.getRepr()) = root | - delta(s, a, _) - or - delta(_, a, s) - ) -} - -/** - * An abstract input symbol, representing a set of concrete characters. - */ -class InputSymbol extends TInputSymbol { - InputSymbol() { not this instanceof Epsilon } - - /** - * Gets a string representation of this input symbol. - */ - string toString() { - this = Char(result) - or - this = CharClass(result) - or - this = Dot() and result = "." - or - this = Any() and result = "[^]" - } -} - -/** - * An abstract input symbol that represents a character class. - */ -abstract class CharacterClass extends InputSymbol { - /** - * Gets a character that is relevant for intersection-tests involving this - * character class. - * - * Specifically, this is any of the characters mentioned explicitly in the - * character class, offset by one if it is inverted. For character class escapes, - * the result is as if the class had been written out as a series of intervals. - * - * This set is large enough to ensure that for any two intersecting character - * classes, one contains a relevant character from the other. - */ - abstract string getARelevantChar(); - - /** - * Holds if this character class matches `char`. - */ - bindingset[char] - abstract predicate matches(string char); - - /** - * Gets a character matched by this character class. - */ - string choose() { result = this.getARelevantChar() and this.matches(result) } -} - -/** - * Provides implementations for `CharacterClass`. - */ -private module CharacterClasses { - /** - * Holds if the character class `cc` has a child (constant or range) that matches `char`. 
- */ - pragma[noinline] - predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { - if RegExpFlags::isIgnoreCase(cc.getRootTerm()) - then - // normalize everything to lower case if the regexp is case insensitive - exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase()) - else hasChildThatMatchesIgnoringCasingFlags(cc, char) - } - - /** - * Holds if the character class `cc` has a child (constant or range) that matches `char`. - * Ignores whether the character class is inside a regular expression that has the ignore case flag. - */ - pragma[noinline] - predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) { - exists(getCharClassForCanonicalTerm(cc)) and - exists(RegExpTerm child | child = cc.getAChild() | - char = child.(RegexpCharacterConstant).getValue() - or - rangeMatchesOnLetterOrDigits(child, char) - or - not rangeMatchesOnLetterOrDigits(child, _) and - char = getARelevantChar() and - exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) | - lo <= char and - char <= hi - ) - or - exists(string charClass | isEscapeClass(child, charClass) | - charClass.toLowerCase() = charClass and - classEscapeMatches(charClass, char) - or - char = getARelevantChar() and - charClass.toUpperCase() = charClass and - not classEscapeMatches(charClass, char) - ) - ) - } - - /** - * Holds if `range` is a range on lower-case, upper-case, or digits, and matches `char`. - * This predicate is used to restrict the searchspace for ranges by only joining `getAnyPossiblyMatchedChar` - * on a few ranges. 
- */ - private predicate rangeMatchesOnLetterOrDigits(RegExpCharacterRange range, string char) { - exists(string lo, string hi | - range.isRange(lo, hi) and lo = lowercaseLetter() and hi = lowercaseLetter() - | - lo <= char and - char <= hi and - char = lowercaseLetter() - ) - or - exists(string lo, string hi | - range.isRange(lo, hi) and lo = upperCaseLetter() and hi = upperCaseLetter() - | - lo <= char and - char <= hi and - char = upperCaseLetter() - ) - or - exists(string lo, string hi | range.isRange(lo, hi) and lo = digit() and hi = digit() | - lo <= char and - char <= hi and - char = digit() - ) - } - - private string lowercaseLetter() { result = "abcdefghijklmnopqrstuvwxyz".charAt(_) } - - private string upperCaseLetter() { result = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".charAt(_) } - - private string digit() { result = [0 .. 9].toString() } - - /** - * Gets a char that could be matched by a regular expression. - * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`. - */ - string getARelevantChar() { - exists(ascii(result)) - or - exists(RegexpCharacterConstant c | result = c.getValue().charAt(_)) - or - classEscapeMatches(_, result) - } - - /** - * Gets a char that is mentioned in the character class `c`. 
- */ - private string getAMentionedChar(RegExpCharacterClass c) { - exists(RegExpTerm child | child = c.getAChild() | - result = child.(RegexpCharacterConstant).getValue() - or - child.(RegExpCharacterRange).isRange(result, _) - or - child.(RegExpCharacterRange).isRange(_, result) - or - exists(string charClass | isEscapeClass(child, charClass) | - result = min(string s | classEscapeMatches(charClass.toLowerCase(), s)) - or - result = max(string s | classEscapeMatches(charClass.toLowerCase(), s)) - ) - ) - } - - bindingset[char, cc] - private string caseNormalize(string char, RegExpTerm cc) { - if RegExpFlags::isIgnoreCase(cc.getRootTerm()) - then result = char.toLowerCase() - else result = char - } - - /** - * An implementation of `CharacterClass` for positive (non inverted) character classes. - */ - private class PositiveCharacterClass extends CharacterClass { - RegExpCharacterClass cc; - - PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() } - - override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) } - - override predicate matches(string char) { hasChildThatMatches(cc, char) } - } - - /** - * An implementation of `CharacterClass` for inverted character classes. - */ - private class InvertedCharacterClass extends CharacterClass { - RegExpCharacterClass cc; - - InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() } - - override string getARelevantChar() { - result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or - nextChar(result) = caseNormalize(getAMentionedChar(cc), cc) - } - - bindingset[char] - override predicate matches(string char) { not hasChildThatMatches(cc, char) } - } - - /** - * Holds if the character class escape `clazz` (\d, \s, or \w) matches `char`. 
- */ - pragma[noinline] - private predicate classEscapeMatches(string clazz, string char) { - clazz = "d" and - char = "0123456789".charAt(_) - or - clazz = "s" and - char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f - or - clazz = "w" and - char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_) - } - - /** - * An implementation of `CharacterClass` for \d, \s, and \w. - */ - private class PositiveCharacterClassEscape extends CharacterClass { - string charClass; - RegExpTerm cc; - - PositiveCharacterClassEscape() { - isEscapeClass(cc, charClass) and - this = getCharClassForCanonicalTerm(cc) and - charClass = ["d", "s", "w"] - } - - override string getARelevantChar() { - charClass = "d" and - result = ["0", "9"] - or - charClass = "s" and - result = " " - or - charClass = "w" and - if RegExpFlags::isIgnoreCase(cc.getRootTerm()) - then result = ["a", "z", "_", "0", "9"] - else result = ["a", "Z", "_", "0", "9"] - } - - override predicate matches(string char) { classEscapeMatches(charClass, char) } - - override string choose() { - charClass = "d" and - result = "9" - or - charClass = "s" and - result = " " - or - charClass = "w" and - result = "a" - } - } - - /** - * An implementation of `CharacterClass` for \D, \S, and \W. 
- */ - private class NegativeCharacterClassEscape extends CharacterClass { - string charClass; - - NegativeCharacterClassEscape() { - exists(RegExpTerm cc | - isEscapeClass(cc, charClass) and - this = getCharClassForCanonicalTerm(cc) and - charClass = ["D", "S", "W"] - ) - } - - override string getARelevantChar() { - charClass = "D" and - result = ["a", "Z", "!"] - or - charClass = "S" and - result = ["a", "9", "!"] - or - charClass = "W" and - result = [" ", "!"] - } - - bindingset[char] - override predicate matches(string char) { - not classEscapeMatches(charClass.toLowerCase(), char) - } - } - - /** Gets a representative for all char classes that match the same chars as `c`. */ - CharacterClass normalize(CharacterClass c) { - exists(string normalization | - normalization = getNormalizationString(c) and - result = - min(CharacterClass cc, string raw | - getNormalizationString(cc) = normalization and cc = CharClass(raw) - | - cc order by raw - ) - ) - } - - /** Gets a string representing all the chars matched by `c` */ - private string getNormalizationString(CharacterClass c) { - (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and - result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar()) - or - (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and - // the string produced by the concat can not contain repeated chars - // so by starting the below with "nn" we can guarantee that - // it will not overlap with the above case. - // and a negative char class can never match the same chars as a positive one, so we don't miss any results from this. 
- result = - "nn:" + - concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar()) - } -} - -private class EdgeLabel extends TInputSymbol { - string toString() { - this = Epsilon() and result = "" - or - exists(InputSymbol s | this = s and result = s.toString()) - } -} - -/** - * A RegExp term that acts like a plus. - * Either it's a RegExpPlus, or it is a range {1,X} where X is >= 30. - * 30 has been chosen as a threshold because for exponential blowup 2^30 is enough to get a decent DOS attack. - */ -private class EffectivelyPlus extends RegExpTerm { - EffectivelyPlus() { - this instanceof RegExpPlus - or - exists(RegExpRange range | - range.getLowerBound() = 1 and - (range.getUpperBound() >= 30 or not exists(range.getUpperBound())) - | - this = range - ) - } -} - -/** - * A RegExp term that acts like a star. - * Either it's a RegExpStar, or it is a range {0,X} where X is >= 30. - */ -private class EffectivelyStar extends RegExpTerm { - EffectivelyStar() { - this instanceof RegExpStar - or - exists(RegExpRange range | - range.getLowerBound() = 0 and - (range.getUpperBound() >= 30 or not exists(range.getUpperBound())) - | - this = range - ) - } -} - -/** - * A RegExp term that acts like a question mark. - * Either it's a RegExpQuestion, or it is a range {0,1}. - */ -private class EffectivelyQuestion extends RegExpTerm { - EffectivelyQuestion() { - this instanceof RegExpOpt - or - exists(RegExpRange range | range.getLowerBound() = 0 and range.getUpperBound() = 1 | - this = range - ) - } -} - -/** - * Gets the state before matching `t`. - */ -pragma[inline] -private State before(RegExpTerm t) { result = Match(t, 0) } - -/** - * Gets a state the NFA may be in after matching `t`. 
- */ -State after(RegExpTerm t) { - exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) - or - exists(RegExpSequence seq, int i | t = seq.getChild(i) | - result = before(seq.getChild(i + 1)) - or - i + 1 = seq.getNumChild() and result = after(seq) - ) - or - exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp)) - or - exists(EffectivelyStar star | t = star.getAChild() | - not isPossessive(star) and - result = before(star) - ) - or - exists(EffectivelyPlus plus | t = plus.getAChild() | - not isPossessive(plus) and - result = before(plus) - or - result = after(plus) - ) - or - exists(EffectivelyQuestion opt | t = opt.getAChild() | result = after(opt)) - or - exists(RegExpRoot root | t = root | - if matchesAnySuffix(root) then result = AcceptAnySuffix(root) else result = Accept(root) - ) -} - -/** - * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`. - */ -predicate delta(State q1, EdgeLabel lbl, State q2) { - exists(RegexpCharacterConstant s, int i | - q1 = Match(s, i) and - ( - not RegExpFlags::isIgnoreCase(s.getRootTerm()) and - lbl = Char(s.getValue().charAt(i)) - or - // normalize everything to lower case if the regexp is case insensitive - RegExpFlags::isIgnoreCase(s.getRootTerm()) and - exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) - ) and - ( - q2 = Match(s, i + 1) - or - s.getValue().length() = i + 1 and - q2 = after(s) - ) - ) - or - exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | - if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() - ) - or - exists(RegExpCharacterClass cc | - cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) - or - q1 = before(cc) and - lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and - q2 = after(cc) - ) - or - exists(RegExpTerm cc | isEscapeClass(cc, _) | - q1 = before(cc) and - lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and - 
q2 = after(cc) - ) - or - exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild())) - or - exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0))) - or - exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0))) - or - exists(EffectivelyStar star | lbl = Epsilon() | - q1 = before(star) and q2 = before(star.getChild(0)) - or - q1 = before(star) and q2 = after(star) - ) - or - exists(EffectivelyPlus plus | lbl = Epsilon() | - q1 = before(plus) and q2 = before(plus.getChild(0)) - ) - or - exists(EffectivelyQuestion opt | lbl = Epsilon() | - q1 = before(opt) and q2 = before(opt.getChild(0)) - or - q1 = before(opt) and q2 = after(opt) - ) - or - exists(RegExpRoot root | q1 = AcceptAnySuffix(root) | - lbl = Any() and q2 = q1 - or - lbl = Epsilon() and q2 = Accept(root) - ) - or - exists(RegExpRoot root | q1 = Match(root, 0) | matchesAnyPrefix(root) and lbl = Any() and q2 = q1) - or - exists(RegExpDollar dollar | q1 = before(dollar) | - lbl = Epsilon() and q2 = Accept(getRoot(dollar)) - ) - or - exists(EmptyPositiveSubPattern empty | q1 = before(empty) | lbl = Epsilon() and q2 = after(empty)) -} - -/** - * Gets a state that `q` has an epsilon transition to. - */ -State epsilonSucc(State q) { delta(q, Epsilon(), result) } - -/** - * Gets a state that has an epsilon transition to `q`. - */ -State epsilonPred(State q) { q = epsilonSucc(result) } - -/** - * Holds if there is a state `q` that can be reached from `q1` - * along epsilon edges, such that there is a transition from - * `q` to `q2` that consumes symbol `s`. - */ -predicate deltaClosed(State q1, InputSymbol s, State q2) { delta(epsilonSucc*(q1), s, q2) } - -/** - * Gets the root containing the given term, that is, the root of the literal, - * or a branch of the root disjunction. 
- */ -RegExpRoot getRoot(RegExpTerm term) { - result = term or - result = getRoot(term.getParent()) -} - -/** - * A state in the NFA. - */ -newtype TState = - /** - * A state representing that the NFA is about to match a term. - * `i` is used to index into multi-char literals. - */ - Match(RelevantRegExpTerm t, int i) { - i = 0 - or - exists(t.(RegexpCharacterConstant).getValue().charAt(i)) - } or - /** - * An accept state, where exactly the given input string is accepted. - */ - Accept(RegExpRoot l) { l.isRelevant() } or - /** - * An accept state, where the given input string, or any string that has this - * string as a prefix, is accepted. - */ - AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } - -/** - * Gets a state that is about to match the regular expression `t`. - */ -State mkMatch(RegExpTerm t) { result = Match(t, 0) } - -/** - * A state in the NFA corresponding to a regular expression. - * - * Each regular expression literal `l` has one accepting state - * `Accept(l)`, one state that accepts all suffixes `AcceptAnySuffix(l)`, - * and a state `Match(t, i)` for every subterm `t`, - * which represents the state of the NFA before starting to - * match `t`, or the `i`th character in `t` if `t` is a constant. - */ -class State extends TState { - RegExpTerm repr; - - State() { - this = Match(repr, _) or - this = Accept(repr) or - this = AcceptAnySuffix(repr) - } - - /** - * Gets a string representation for this state in a regular expression. - */ - string toString() { - exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")") - or - this instanceof Accept and - result = "Accept(" + repr + ")" - or - this instanceof AcceptAnySuffix and - result = "AcceptAny(" + repr + ")" - } - - /** - * Gets the location for this state. - */ - Location getLocation() { result = repr.getLocation() } - - /** - * Gets the term represented by this state. 
- */ - RegExpTerm getRepr() { result = repr } -} - -/** - * Gets the minimum char that is matched by both the character classes `c` and `d`. - */ -private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) { - result = min(getAOverlapBetweenCharacterClasses(c, d)) -} - -/** - * Gets a char that is matched by both the character classes `c` and `d`. - * And `c` and `d` is not the same character class. - */ -private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) { - sharesRoot(c, d) and - result = [c.getARelevantChar(), d.getARelevantChar()] and - c.matches(result) and - d.matches(result) and - not c = d -} - -/** - * Gets a character that is represented by both `c` and `d`. - */ -string intersect(InputSymbol c, InputSymbol d) { - (sharesRoot(c, d) or [c, d] = Any()) and - ( - c = Char(result) and - d = getAnInputSymbolMatching(result) - or - result = getMinOverlapBetweenCharacterClasses(c, d) - or - result = c.(CharacterClass).choose() and - ( - d = c - or - d = Dot() and - not (result = "\n" or result = "\r") - or - d = Any() - ) - or - (c = Dot() or c = Any()) and - (d = Dot() or d = Any()) and - result = "a" - ) - or - result = intersect(d, c) -} - -/** - * Gets a symbol that matches `char`. - */ -bindingset[char] -InputSymbol getAnInputSymbolMatching(string char) { - result = Char(char) - or - result.(CharacterClass).matches(char) - or - result = Dot() and - not (char = "\n" or char = "\r") - or - result = Any() -} - -/** - * Holds if `state` is a start state. - */ -predicate isStartState(State state) { - state = mkMatch(any(RegExpRoot r)) - or - exists(RegExpCaret car | state = after(car)) -} - -/** - * Holds if `state` is a candidate for ReDoS with string `pump`. - */ -signature predicate isCandidateSig(State state, string pump); - -/** - * Holds if `state` is a candidate for ReDoS. 
- */ -signature predicate isCandidateSig(State state); - -/** - * Predicates for constructing a prefix string that leads to a given state. - */ -module PrefixConstruction { - /** - * Holds if `state` is the textually last start state for the regular expression. - */ - private predicate lastStartState(RelevantState state) { - exists(RegExpRoot root | - state = - max(RelevantState s, Location l | - isStartState(s) and - getRoot(s.getRepr()) = root and - l = s.getRepr().getLocation() - | - s - order by - l.getStartLine(), l.getStartColumn(), s.getRepr().toString(), l.getEndColumn(), - l.getEndLine() - ) - ) - } - - /** - * Holds if there exists any transition (Epsilon() or other) from `a` to `b`. - */ - private predicate existsTransition(State a, State b) { delta(a, _, b) } - - /** - * Gets the minimum number of transitions it takes to reach `state` from the `start` state. - */ - int prefixLength(State start, State state) = - shortestDistances(lastStartState/1, existsTransition/2)(start, state, result) - - /** - * Gets the minimum number of transitions it takes to reach `state` from the start state. - */ - private int lengthFromStart(State state) { result = prefixLength(_, state) } - - /** - * Gets a string for which the regular expression will reach `state`. - * - * Has at most one result for any given `state`. - * This predicate will not always have a result even if there is a ReDoS issue in - * the regular expression. - */ - string prefix(State state) { - lastStartState(state) and - result = "" - or - // the search stops past the last redos candidate state. 
- lengthFromStart(state) <= max(lengthFromStart(any(State s | isCandidate(s)))) and - exists(State prev | - // select a unique predecessor (by an arbitrary measure) - prev = - min(State s, Location loc | - lengthFromStart(s) = lengthFromStart(state) - 1 and - loc = s.getRepr().getLocation() and - delta(s, _, state) - | - s - order by - loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(), loc.getEndColumn(), - s.getRepr().toString() - ) - | - // greedy search for the shortest prefix - result = prefix(prev) and delta(prev, Epsilon(), state) - or - not delta(prev, Epsilon(), state) and - result = prefix(prev) + getCanonicalEdgeChar(prev, state) - ) - } - - /** - * Gets a canonical char for which there exists a transition from `prev` to `next` in the NFA. - */ - private string getCanonicalEdgeChar(State prev, State next) { - result = - min(string c | delta(prev, any(InputSymbol symbol | c = intersect(Any(), symbol)), next)) - } - - /** A state within a regular expression that contains a candidate state. */ - class RelevantState instanceof State { - RelevantState() { - exists(State s | isCandidate(s) | getRoot(s.getRepr()) = getRoot(this.getRepr())) - } - - /** Gets a string representation for this state in a regular expression. */ - string toString() { result = State.super.toString() } - - /** Gets the term represented by this state. */ - RegExpTerm getRepr() { result = State.super.getRepr() } - } -} - -/** - * A module for pruning candidate ReDoS states. - * The candidates are specified by the `isCandidate` signature predicate. - * The candidates are checked for rejecting suffixes and deduplicated, - * and the resulting ReDoS states are read by the `hasReDoSResult` predicate. - */ -module ReDoSPruning { - /** - * Holds if repeating `pump` starting at `state` is a candidate for causing backtracking. - * No check whether a rejected suffix exists has been made. 
- */ - private predicate isReDoSCandidate(State state, string pump) { - isCandidate(state, pump) and - not state = acceptsAnySuffix() and // pruning early - these can never get stuck in a rejecting state. - ( - not isCandidate(epsilonSucc+(state), _) - or - epsilonSucc+(state) = state and - state = - max(State s, Location l | - s = epsilonSucc+(state) and - l = s.getRepr().getLocation() and - isCandidate(s, _) and - s.getRepr() instanceof InfiniteRepetitionQuantifier - | - s order by l.getStartLine(), l.getStartColumn(), l.getEndColumn(), l.getEndLine() - ) - ) - } - - /** Gets a state that can reach the `accept-any` state using only epsilon steps. */ - private State acceptsAnySuffix() { epsilonSucc*(result) = AcceptAnySuffix(_) } - - predicate isCandidateState(State s) { isReDoSCandidate(s, _) } - - import PrefixConstruction as Prefix - - class RelevantState = Prefix::RelevantState; - - /** - * Predicates for testing the presence of a rejecting suffix. - * - * These predicates are used to ensure that the all states reached from the fork - * by repeating `w` have a rejecting suffix. - * - * For example, a regexp like `/^(a+)+/` will accept any string as long the prefix is - * some number of `"a"`s, and it is therefore not possible to construct a rejecting suffix. - * - * A regexp like `/(a+)+$/` or `/(a+)+b/` trivially has a rejecting suffix, - * as the suffix "X" will cause both the regular expressions to be rejected. - * - * The string `w` is repeated any number of times because it needs to be - * infinitely repeatable for the attack to work. - * For the regular expression `/((ab)+)*abab/` the accepting state is not reachable from the fork - * using epsilon transitions. But any attempt at repeating `w` will end in a state that accepts all suffixes. - */ - private module SuffixConstruction { - /** - * Holds if all states reachable from `fork` by repeating `w` - * are likely rejectable by appending some suffix. 
- */ - predicate reachesOnlyRejectableSuffixes(State fork, string w) { - isReDoSCandidate(fork, w) and - forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and - not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened. - } - - /** - * Holds if there likely exists a suffix starting from `s` that leads to the regular expression being rejected. - * This predicate might find impossible suffixes when searching for suffixes of length > 1, which can cause FPs. - */ - pragma[noinline] - private predicate isLikelyRejectable(RelevantState s) { - // exists a reject edge with some char. - hasRejectEdge(s) - or - hasEdgeToLikelyRejectable(s) - or - // stopping here is rejection - isRejectState(s) - } - - /** - * Holds if `s` is not an accept state, and there is no epsilon transition to an accept state. - */ - predicate isRejectState(RelevantState s) { not epsilonSucc*(s) = Accept(_) } - - /** - * Holds if there is likely a non-empty suffix leading to rejection starting in `s`. - */ - pragma[noopt] - predicate hasEdgeToLikelyRejectable(RelevantState s) { - // all edges (at least one) with some char leads to another state that is rejectable. - // the `next` states might not share a common suffix, which can cause FPs. - exists(string char | char = hasEdgeToLikelyRejectableHelper(s) | - // noopt to force `hasEdgeToLikelyRejectableHelper` to be first in the join-order. - exists(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next)) and - forall(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next)) - ) - } - - /** - * Gets a char for there exists a transition away from `s`, - * and `s` has not been found to be rejectable by `hasRejectEdge` or `isRejectState`. 
- */ - pragma[noinline] - private string hasEdgeToLikelyRejectableHelper(RelevantState s) { - not hasRejectEdge(s) and - not isRejectState(s) and - deltaClosedChar(s, result, _) - } - - /** - * Holds if there is a state `next` that can be reached from `prev` - * along epsilon edges, such that there is a transition from - * `prev` to `next` that the character symbol `char`. - */ - predicate deltaClosedChar(RelevantState prev, string char, RelevantState next) { - deltaClosed(prev, getAnInputSymbolMatchingRelevant(char), next) - } - - pragma[noinline] - InputSymbol getAnInputSymbolMatchingRelevant(string char) { - char = relevant(_) and - result = getAnInputSymbolMatching(char) - } - - pragma[noinline] - RegExpRoot relevantRoot() { - exists(RegExpTerm term, State s | - s.getRepr() = term and isCandidateState(s) and result = term.getRootTerm() - ) - } - - /** - * Gets a char used for finding possible suffixes inside `root`. - */ - pragma[noinline] - private string relevant(RegExpRoot root) { - root = relevantRoot() and - ( - exists(ascii(result)) and exists(root) - or - exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _)) - or - // The characters from `hasSimpleRejectEdge`. Only `\n` is really needed (as `\n` is not in the `ascii` relation). - // The three chars must be kept in sync with `hasSimpleRejectEdge`. - result = ["|", "\n", "Z"] and exists(root) - ) - } - - /** - * Holds if there exists a `char` such that there is no edge from `s` labeled `char` in our NFA. - * The NFA does not model reject states, so the above is the same as saying there is a reject edge. - */ - private predicate hasRejectEdge(State s) { - hasSimpleRejectEdge(s) - or - not hasSimpleRejectEdge(s) and - exists(string char | char = relevant(getRoot(s.getRepr())) | not deltaClosedChar(s, char, _)) - } - - /** - * Holds if there is no edge from `s` labeled with "|", "\n", or "Z" in our NFA. - * This predicate is used as a cheap pre-processing to speed up `hasRejectEdge`. 
- */ - private predicate hasSimpleRejectEdge(State s) { - // The three chars were chosen arbitrarily. The three chars must be kept in sync with `relevant`. - exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _)) - } - - /** - * Gets a state that can be reached from pumpable `fork` consuming all - * chars in `w` any number of times followed by the first `i+1` characters of `w`. - */ - pragma[noopt] - private State process(State fork, string w, int i) { - exists(State prev | prev = getProcessPrevious(fork, i, w) | - not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected. - exists(string char, InputSymbol sym | - char = w.charAt(i) and - deltaClosed(prev, sym, result) and - // noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`. - // Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found. - sym = getAProcessInputSymbol(char) - ) - ) - } - - /** - * Gets a state that can be reached from pumpable `fork` consuming all - * chars in `w` any number of times followed by the first `i` characters of `w`. - */ - private State getProcessPrevious(State fork, int i, string w) { - isReDoSCandidate(fork, w) and - ( - i = 0 and result = fork - or - result = process(fork, w, i - 1) - or - // repeat until fixpoint - i = 0 and - result = process(fork, w, w.length() - 1) - ) - } - - /** - * Gets an InputSymbol that matches `char`. - * The predicate is specialized to only have a result for the `char`s that are relevant for the `process` predicate. - */ - private InputSymbol getAProcessInputSymbol(string char) { - char = getAProcessChar() and - result = getAnInputSymbolMatching(char) - } - - /** - * Gets a `char` that occurs in a `pump` string. 
- */ - private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) } - } - - /** - * Holds if `term` may cause superlinear backtracking on strings containing many repetitions of `pump`. - * Gets the shortest string that causes superlinear backtracking. - */ - private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) { - exists(int i, string c | s = Match(term, i) | - c = - min(string w | - isCandidate(s, w) and - SuffixConstruction::reachesOnlyRejectableSuffixes(s, w) - | - w order by w.length(), w - ) and - pump = escape(rotate(c, i)) - ) - } - - /** - * Holds if the state `s` (represented by the term `t`) can have backtracking with repetitions of `pump`. - * - * `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found). - */ - predicate hasReDoSResult(RegExpTerm t, string pump, State s, string prefixMsg) { - isReDoSAttackable(t, pump, s) and - ( - prefixMsg = "starting with '" + escape(Prefix::prefix(s)) + "' and " and - not Prefix::prefix(s) = "" - or - Prefix::prefix(s) = "" and prefixMsg = "" - or - not exists(Prefix::prefix(s)) and prefixMsg = "" - ) - } - - /** - * Gets the result of backslash-escaping newlines, carriage-returns and - * backslashes in `s`. - */ - bindingset[s] - private string escape(string s) { - result = - s.replaceAll("\\", "\\\\") - .replaceAll("\n", "\\n") - .replaceAll("\r", "\\r") - .replaceAll("\t", "\\t") - } - - /** - * Gets `str` with the last `i` characters moved to the front. - * - * We use this to adjust the pump string to match with the beginning of - * a RegExpTerm, so it doesn't start in the middle of a constant. 
- */ - bindingset[str, i] - private string rotate(string str, int i) { - result = str.suffix(str.length() - i) + str.prefix(str.length() - i) - } -} - -/** - * A module that describes a tree where each node has one or more associated characters, also known as a trie. - * The root node has no associated character. - * This module is a signature used in `Concretizer`. - */ -signature module CharTree { - /** A node in the tree. */ - class CharNode; - - /** Gets the previous node in the tree from `t`. */ - CharNode getPrev(CharNode t); - - /** - * Holds if `n` is at the end of a tree. I.e. a node that should have a result in the `Concretizer` module. - * Such a node can still have children. - */ - predicate isARelevantEnd(CharNode n); - - /** Gets a char associated with `t`. */ - string getChar(CharNode t); -} - -/** - * Implements an algorithm for computing all possible strings - * from following a tree of nodes (as described in `CharTree`). - * - * The string is build using one big concat, where all the chars are computed first. - * See `concretize`. - */ -module Concretizer { - private class Node = Impl::CharNode; - - private predicate getPrev = Impl::getPrev/1; - - private predicate isARelevantEnd = Impl::isARelevantEnd/1; - - private predicate getChar = Impl::getChar/1; - - /** Holds if `n` is on a path from the root to a leaf, and is therefore relevant for the results in `concretize`. */ - private predicate isRelevant(Node n) { - isARelevantEnd(n) - or - exists(Node succ | isRelevant(succ) | n = getPrev(succ)) - } - - /** Holds if `n` is a root with no predecessors. */ - private predicate isRoot(Node n) { not exists(getPrev(n)) } - - /** Gets the distance from a root to `n`. */ - private int nodeDepth(Node n) { - result = 0 and isRoot(n) - or - isRelevant(n) and - exists(Node prev | result = nodeDepth(prev) + 1 | prev = getPrev(n)) - } - - /** Gets an ancestor of `end`, where `end` is a node that should have a result in `concretize`. 
*/ - private Node getAnAncestor(Node end) { isARelevantEnd(end) and result = getPrev*(end) } - - /** Gets the `i`th character on the path from the root to `n`. */ - pragma[noinline] - private string getPrefixChar(Node n, int i) { - exists(Node ancestor | - result = getChar(ancestor) and - ancestor = getAnAncestor(n) and - i = nodeDepth(ancestor) - ) - } - - /** Gets a string corresponding to `node`. */ - language[monotonicAggregates] - string concretize(Node n) { - result = strictconcat(int i | exists(getPrefixChar(n, i)) | getPrefixChar(n, i) order by i) - } -} +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +// NfaUtils should be used directly from the shared pack, and not from this file. +deprecated private import codeql.regex.nfa.NfaUtils::Make as Dep +import Dep diff --git a/java/ql/lib/semmle/code/java/security/regexp/NfaUtilsSpecific.qll b/java/ql/lib/semmle/code/java/security/regexp/NfaUtilsSpecific.qll deleted file mode 100644 index 742229eacca..00000000000 --- a/java/ql/lib/semmle/code/java/security/regexp/NfaUtilsSpecific.qll +++ /dev/null @@ -1,76 +0,0 @@ -/** - * This module should provide a class hierarchy corresponding to a parse tree of regular expressions. - * This is the interface to the shared ReDoS library. - */ - -private import java -import semmle.code.FileSystem -import semmle.code.java.regex.RegexTreeView - -/** - * Holds if `term` is an escape class representing e.g. `\d`. - * `clazz` is which character class it represents, e.g. "d" for `\d`. - */ -predicate isEscapeClass(RegExpTerm term, string clazz) { - term.(RegExpCharacterClassEscape).getValue() = clazz - or - term.(RegExpNamedProperty).getBackslashEquivalent() = clazz -} - -/** - * Holds if `term` is a possessive quantifier, e.g. `a*+`. - */ -predicate isPossessive(RegExpQuantifier term) { term.isPossessive() } - -/** - * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. 
- */ -predicate matchesAnyPrefix(RegExpTerm term) { not term.getRegex().matchesFullString() } - -/** - * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. - */ -predicate matchesAnySuffix(RegExpTerm term) { not term.getRegex().matchesFullString() } - -/** - * Holds if the regular expression should not be considered. - * - * We make the pragmatic performance optimization to ignore regular expressions in files - * that do not belong to the project code (such as installed dependencies). - */ -predicate isExcluded(RegExpParent parent) { - not exists(parent.getRegex().getLocation().getFile().getRelativePath()) - or - // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so - // we explicitly exclude these. - strictcount(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 -} - -/** - * A module containing predicates for determining which flags a regular expression have. - */ -module RegExpFlags { - /** - * Holds if `root` has the `i` flag for case-insensitive matching. - */ - predicate isIgnoreCase(RegExpTerm root) { - root.isRootTerm() and - root.getLiteral().isIgnoreCase() - } - - /** - * Gets the flags for `root`, or the empty string if `root` has no flags. - */ - deprecated string getFlags(RegExpTerm root) { - root.isRootTerm() and - result = root.getLiteral().getFlags() - } - - /** - * Holds if `root` has the `s` flag for multi-line matching. 
- */ - predicate isDotAll(RegExpTerm root) { - root.isRootTerm() and - root.getLiteral().isDotAll() - } -} diff --git a/java/ql/lib/semmle/code/java/security/regexp/PolynomialReDoSQuery.qll b/java/ql/lib/semmle/code/java/security/regexp/PolynomialReDoSQuery.qll index b0a8ff1a3c5..2a822ac69de 100644 --- a/java/ql/lib/semmle/code/java/security/regexp/PolynomialReDoSQuery.qll +++ b/java/ql/lib/semmle/code/java/security/regexp/PolynomialReDoSQuery.qll @@ -1,19 +1,19 @@ /** Definitions and configurations for the Polynomial ReDoS query */ -import semmle.code.java.security.regexp.SuperlinearBackTracking +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.nfa.SuperlinearBackTracking::Make as SuperlinearBackTracking import semmle.code.java.dataflow.DataFlow -import semmle.code.java.regex.RegexTreeView import semmle.code.java.regex.RegexFlowConfigs import semmle.code.java.dataflow.FlowSources /** A sink for polynomial redos queries, where a regex is matched. */ class PolynomialRedosSink extends DataFlow::Node { - RegExpLiteral reg; + TreeView::RegExpLiteral reg; PolynomialRedosSink() { regexMatchedAgainst(reg.getRegex(), this.asExpr()) } /** Gets the regex that is matched against this node. */ - RegExpTerm getRegExp() { result.getParent() = reg } + TreeView::RegExpTerm getRegExp() { result.getParent() = reg } } /** @@ -49,7 +49,8 @@ class PolynomialRedosConfig extends TaintTracking::Configuration { /** Holds if there is flow from `source` to `sink` that is matched against the regexp term `regexp` that is vulnerable to Polynomial ReDoS. 
*/ predicate hasPolynomialReDoSResult( - DataFlow::PathNode source, DataFlow::PathNode sink, PolynomialBackTrackingTerm regexp + DataFlow::PathNode source, DataFlow::PathNode sink, + SuperlinearBackTracking::PolynomialBackTrackingTerm regexp ) { any(PolynomialRedosConfig config).hasFlowPath(source, sink) and regexp.getRootTerm() = sink.getNode().(PolynomialRedosSink).getRegExp() diff --git a/java/ql/lib/semmle/code/java/security/regexp/SuperlinearBackTracking.qll b/java/ql/lib/semmle/code/java/security/regexp/SuperlinearBackTracking.qll index 14a69dc0644..623b1540ef1 100644 --- a/java/ql/lib/semmle/code/java/security/regexp/SuperlinearBackTracking.qll +++ b/java/ql/lib/semmle/code/java/security/regexp/SuperlinearBackTracking.qll @@ -1,11 +1,4 @@ /** - * Provides classes for working with regular expressions that can - * perform backtracking in superlinear time. - */ - -import NfaUtils - -/* * This module implements the analysis described in the paper: * Valentin Wustholz, Oswaldo Olivo, Marijn J. H. Heule, and Isil Dillig: * Static Detection of DoS Vulnerabilities in @@ -42,377 +35,7 @@ import NfaUtils * It also doesn't find all transitions in the product automaton, which can cause false negatives. */ -/** - * Gets any root (start) state of a regular expression. - */ -private State getRootState() { result = mkMatch(any(RegExpRoot r)) } - -private newtype TStateTuple = - MkStateTuple(State q1, State q2, State q3) { - // starts at (pivot, pivot, succ) - isStartLoops(q1, q3) and q1 = q2 - or - step(_, _, _, _, q1, q2, q3) and FeasibleTuple::isFeasibleTuple(q1, q2, q3) - } - -/** - * A state in the product automaton. - * The product automaton contains 3-tuples of states. - * - * We lazily only construct those states that we are actually - * going to need. - * Either a start state `(pivot, pivot, succ)`, or a state - * where there exists a transition from an already existing state. 
- * - * The exponential variant of this query (`js/redos`) uses an optimization - * trick where `q1 <= q2`. This trick cannot be used here as the order - * of the elements matter. - */ -class StateTuple extends TStateTuple { - State q1; - State q2; - State q3; - - StateTuple() { this = MkStateTuple(q1, q2, q3) } - - /** - * Gest a string representation of this tuple. - */ - string toString() { result = "(" + q1 + ", " + q2 + ", " + q3 + ")" } - - /** - * Holds if this tuple is `(r1, r2, r3)`. - */ - pragma[noinline] - predicate isTuple(State r1, State r2, State r3) { r1 = q1 and r2 = q2 and r3 = q3 } -} - -/** - * A module for determining feasible tuples for the product automaton. - * - * The implementation is split into many predicates for performance reasons. - */ -private module FeasibleTuple { - /** - * Holds if the tuple `(r1, r2, r3)` might be on path from a start-state to an end-state in the product automaton. - */ - pragma[inline] - predicate isFeasibleTuple(State r1, State r2, State r3) { - // The first element is either inside a repetition (or the start state itself) - isRepetitionOrStart(r1) and - // The last element is inside a repetition - stateInsideRepetition(r3) and - // The states are reachable in the NFA in the order r1 -> r2 -> r3 - delta+(r1) = r2 and - delta+(r2) = r3 and - // The first element can reach a beginning (the "pivot" state in a `(pivot, succ)` pair). - canReachABeginning(r1) and - // The last element can reach a target (the "succ" state in a `(pivot, succ)` pair). - canReachATarget(r3) - } - - /** - * Holds if `s` is either inside a repetition, or is the start state (which is a repetition). - */ - pragma[noinline] - private predicate isRepetitionOrStart(State s) { stateInsideRepetition(s) or s = getRootState() } - - /** - * Holds if state `s` might be inside a backtracking repetition. 
- */ - pragma[noinline] - private predicate stateInsideRepetition(State s) { - s.getRepr().getParent*() instanceof InfiniteRepetitionQuantifier - } - - /** - * Holds if there exists a path in the NFA from `s` to a "pivot" state - * (from a `(pivot, succ)` pair that starts the search). - */ - pragma[noinline] - private predicate canReachABeginning(State s) { - delta+(s) = any(State pivot | isStartLoops(pivot, _)) - } - - /** - * Holds if there exists a path in the NFA from `s` to a "succ" state - * (from a `(pivot, succ)` pair that starts the search). - */ - pragma[noinline] - private predicate canReachATarget(State s) { delta+(s) = any(State succ | isStartLoops(_, succ)) } -} - -/** - * Holds if `pivot` and `succ` are a pair of loops that could be the beginning of a quadratic blowup. - * - * There is a slight implementation difference compared to the paper: this predicate requires that `pivot != succ`. - * The case where `pivot = succ` causes exponential backtracking and is handled by the `js/redos` query. - */ -predicate isStartLoops(State pivot, State succ) { - pivot != succ and - succ.getRepr() instanceof InfiniteRepetitionQuantifier and - delta+(pivot) = succ and - ( - pivot.getRepr() instanceof InfiniteRepetitionQuantifier - or - pivot = mkMatch(any(RegExpRoot root)) - ) -} - -/** - * Gets a state for which there exists a transition in the NFA from `s'. - */ -State delta(State s) { delta(s, _, result) } - -/** - * Holds if there are transitions from the components of `q` to the corresponding - * components of `r` labelled with `s1`, `s2`, and `s3`, respectively. - */ -pragma[noinline] -predicate step(StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, StateTuple r) { - exists(State r1, State r2, State r3 | - step(q, s1, s2, s3, r1, r2, r3) and r = MkStateTuple(r1, r2, r3) - ) -} - -/** - * Holds if there are transitions from the components of `q` to `r1`, `r2`, and `r3 - * labelled with `s1`, `s2`, and `s3`, respectively. 
- */ -pragma[noopt] -predicate step( - StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, State r1, State r2, State r3 -) { - exists(State q1, State q2, State q3 | q.isTuple(q1, q2, q3) | - deltaClosed(q1, s1, r1) and - deltaClosed(q2, s2, r2) and - deltaClosed(q3, s3, r3) and - // use noopt to force the join on `getAThreewayIntersect` to happen last. - exists(getAThreewayIntersect(s1, s2, s3)) - ) -} - -/** - * Gets a char that is matched by all the edges `s1`, `s2`, and `s3`. - * - * The result is not complete, and might miss some combination of edges that share some character. - */ -pragma[noinline] -string getAThreewayIntersect(InputSymbol s1, InputSymbol s2, InputSymbol s3) { - result = minAndMaxIntersect(s1, s2) and result = [intersect(s2, s3), intersect(s1, s3)] - or - result = minAndMaxIntersect(s1, s3) and result = [intersect(s2, s3), intersect(s1, s2)] - or - result = minAndMaxIntersect(s2, s3) and result = [intersect(s1, s2), intersect(s1, s3)] -} - -/** - * Gets the minimum and maximum characters that intersect between `a` and `b`. - * This predicate is used to limit the size of `getAThreewayIntersect`. - */ -pragma[noinline] -string minAndMaxIntersect(InputSymbol a, InputSymbol b) { - result = [min(intersect(a, b)), max(intersect(a, b))] -} - -private newtype TTrace = - Nil() or - Step(InputSymbol s1, InputSymbol s2, InputSymbol s3, TTrace t) { - isReachableFromStartTuple(_, _, t, s1, s2, s3, _, _) - } - -/** - * A list of tuples of input symbols that describe a path in the product automaton - * starting from some start state. - */ -class Trace extends TTrace { - /** - * Gets a string representation of this Trace that can be used for debug purposes. 
- */ - string toString() { - this = Nil() and result = "Nil()" - or - exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace t | this = Step(s1, s2, s3, t) | - result = "Step(" + s1 + ", " + s2 + ", " + s3 + ", " + t + ")" - ) - } -} - -/** - * Holds if there exists a transition from `r` to `q` in the product automaton. - * Notice that the arguments are flipped, and thus the direction is backwards. - */ -pragma[noinline] -predicate tupleDeltaBackwards(StateTuple q, StateTuple r) { step(r, _, _, _, q) } - -/** - * Holds if `tuple` is an end state in our search. - * That means there exists a pair of loops `(pivot, succ)` such that `tuple = (pivot, succ, succ)`. - */ -predicate isEndTuple(StateTuple tuple) { tuple = getAnEndTuple(_, _) } - -/** - * Gets the minimum length of a path from `r` to some an end state `end`. - * - * The implementation searches backwards from the end-tuple. - * This approach was chosen because it is way more efficient if the first predicate given to `shortestDistances` is small. - * The `end` argument must always be an end state. - */ -int distBackFromEnd(StateTuple r, StateTuple end) = - shortestDistances(isEndTuple/1, tupleDeltaBackwards/2)(end, r, result) - -/** - * Holds if there exists a pair of repetitions `(pivot, succ)` in the regular expression such that: - * `tuple` is reachable from `(pivot, pivot, succ)` in the product automaton, - * and there is a distance of `dist` from `tuple` to the nearest end-tuple `(pivot, succ, succ)`, - * and a path from a start-state to `tuple` follows the transitions in `trace`. 
- */ -private predicate isReachableFromStartTuple( - State pivot, State succ, StateTuple tuple, Trace trace, int dist -) { - exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace v | - isReachableFromStartTuple(pivot, succ, v, s1, s2, s3, tuple, dist) and - trace = Step(s1, s2, s3, v) - ) -} - -private predicate isReachableFromStartTuple( - State pivot, State succ, Trace trace, InputSymbol s1, InputSymbol s2, InputSymbol s3, - StateTuple tuple, int dist -) { - // base case. - exists(State q1, State q2, State q3 | - isStartLoops(pivot, succ) and - step(MkStateTuple(pivot, pivot, succ), s1, s2, s3, tuple) and - tuple = MkStateTuple(q1, q2, q3) and - trace = Nil() and - dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ)) - ) - or - // recursive case - exists(StateTuple p | - isReachableFromStartTuple(pivot, succ, p, trace, dist + 1) and - dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ)) and - step(p, s1, s2, s3, tuple) - ) -} - -/** - * Gets the tuple `(pivot, succ, succ)` from the product automaton. - */ -StateTuple getAnEndTuple(State pivot, State succ) { - isStartLoops(pivot, succ) and - result = MkStateTuple(pivot, succ, succ) -} - -/** An implementation of a chain containing chars for use by `Concretizer`. */ -private module CharTreeImpl implements CharTree { - class CharNode = Trace; - - CharNode getPrev(CharNode t) { t = Step(_, _, _, result) } - - /** Holds if `n` is used in `isPumpable`. */ - predicate isARelevantEnd(CharNode n) { - exists(State pivot, State succ | - isReachableFromStartTuple(pivot, succ, getAnEndTuple(pivot, succ), n, _) - ) - } - - string getChar(CharNode t) { - exists(InputSymbol s1, InputSymbol s2, InputSymbol s3 | t = Step(s1, s2, s3, _) | - result = getAThreewayIntersect(s1, s2, s3) - ) - } -} - -/** - * Holds if matching repetitions of `pump` can: - * 1) Transition from `pivot` back to `pivot`. - * 2) Transition from `pivot` to `succ`. - * 3) Transition from `succ` to `succ`. 
- * - * From theorem 3 in the paper linked in the top of this file we can therefore conclude that - * the regular expression has polynomial backtracking - if a rejecting suffix exists. - * - * This predicate is used by `SuperLinearReDoSConfiguration`, and the final results are - * available in the `hasReDoSResult` predicate. - */ -predicate isPumpable(State pivot, State succ, string pump) { - exists(StateTuple q, Trace t | - isReachableFromStartTuple(pivot, succ, q, t, _) and - q = getAnEndTuple(pivot, succ) and - pump = Concretizer ::concretize(t) - ) -} - -/** - * Holds if states starting in `state` can have polynomial backtracking with the string `pump`. - */ -predicate isReDoSCandidate(State state, string pump) { isPumpable(_, state, pump) } - -/** - * Holds if repetitions of `pump` at `t` will cause polynomial backtracking. - */ -predicate polynomialReDoS(RegExpTerm t, string pump, string prefixMsg, RegExpTerm prev) { - exists(State s, State pivot | - ReDoSPruning ::hasReDoSResult(t, pump, s, prefixMsg) and - isPumpable(pivot, s, _) and - prev = pivot.getRepr() - ) -} - -/** - * Gets a message for why `term` can cause polynomial backtracking. - */ -string getReasonString(RegExpTerm term, string pump, string prefixMsg, RegExpTerm prev) { - polynomialReDoS(term, pump, prefixMsg, prev) and - result = - "Strings " + prefixMsg + "with many repetitions of '" + pump + - "' can start matching anywhere after the start of the preceeding " + prev -} - -/** - * A term that may cause a regular expression engine to perform a - * polynomial number of match attempts, relative to the input length. - */ -class PolynomialBackTrackingTerm extends InfiniteRepetitionQuantifier { - string reason; - string pump; - string prefixMsg; - RegExpTerm prev; - - PolynomialBackTrackingTerm() { - reason = getReasonString(this, pump, prefixMsg, prev) and - // there might be many reasons for this term to have polynomial backtracking - we pick the shortest one. 
- reason = min(string msg | msg = getReasonString(this, _, _, _) | msg order by msg.length(), msg) - } - - /** - * Holds if all non-empty successors to the polynomial backtracking term matches the end of the line. - */ - predicate isAtEndLine() { - forall(RegExpTerm succ | this.getSuccessor+() = succ and not matchesEpsilon(succ) | - succ instanceof RegExpDollar - ) - } - - /** - * Gets the string that should be repeated to cause this regular expression to perform polynomially. - */ - string getPumpString() { result = pump } - - /** - * Gets a message for which prefix a matching string must start with for this term to cause polynomial backtracking. - */ - string getPrefixMessage() { result = prefixMsg } - - /** - * Gets a predecessor to `this`, which also loops on the pump string, and thereby causes polynomial backtracking. - */ - RegExpTerm getPreviousLoop() { result = prev } - - /** - * Gets the reason for the number of match attempts. - */ - string getReason() { result = reason } -} +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +// SuperlinearBackTracking should be used directly from the shared pack, and not from this file. +deprecated private import codeql.regex.nfa.SuperlinearBackTracking::Make as Dep +import Dep diff --git a/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/old.dbscheme b/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/old.dbscheme new file mode 100644 index 00000000000..709f1d1fd04 --- /dev/null +++ b/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/old.dbscheme @@ -0,0 +1,1240 @@ +/** + * An invocation of the compiler. Note that more than one file may be + * compiled per invocation. For example, this command compiles three + * source files: + * + * javac A.java B.java C.java + * + * The `id` simply identifies the invocation, while `cwd` is the working + * directory from which the compiler was invoked. + */ +compilations( + /** + * An invocation of the compiler. 
Note that more than one file may + * be compiled per invocation. For example, this command compiles + * three source files: + * + * javac A.java B.java C.java + */ + unique int id : @compilation, + int kind: int ref, + string cwd : string ref, + string name : string ref +); + +case @compilation.kind of + 1 = @javacompilation +| 2 = @kotlincompilation +; + +compilation_started( + int id : @compilation ref +) + +/** + * The arguments that were passed to the extractor for a compiler + * invocation. If `id` is for the compiler invocation + * + * javac A.java B.java C.java + * + * then typically there will be rows for + * + * num | arg + * --- | --- + * 0 | *path to extractor* + * 1 | `--javac-args` + * 2 | A.java + * 3 | B.java + * 4 | C.java + */ +#keyset[id, num] +compilation_args( + int id : @compilation ref, + int num : int ref, + string arg : string ref +); + +/** + * The source files that are compiled by a compiler invocation. + * If `id` is for the compiler invocation + * + * javac A.java B.java C.java + * + * then there will be rows for + * + * num | arg + * --- | --- + * 0 | A.java + * 1 | B.java + * 2 | C.java + */ +#keyset[id, num] +compilation_compiling_files( + int id : @compilation ref, + int num : int ref, + int file : @file ref +); + +/** + * For each file recorded in `compilation_compiling_files`, + * there will be a corresponding row in + * `compilation_compiling_files_completed` once extraction + * of that file is complete. The `result` will indicate the + * extraction result: + * + * 0: Successfully extracted + * 1: Errors were encountered, but extraction recovered + * 2: Errors were encountered, and extraction could not recover + */ +#keyset[id, num] +compilation_compiling_files_completed( + int id : @compilation ref, + int num : int ref, + int result : int ref +); + +/** + * The time taken by the extractor for a compiler invocation. 
+ * + * For each file `num`, there will be rows for + * + * kind | seconds + * ---- | --- + * 1 | CPU seconds used by the extractor frontend + * 2 | Elapsed seconds during the extractor frontend + * 3 | CPU seconds used by the extractor backend + * 4 | Elapsed seconds during the extractor backend + */ +#keyset[id, num, kind] +compilation_time( + int id : @compilation ref, + int num : int ref, + /* kind: + 1 = frontend_cpu_seconds + 2 = frontend_elapsed_seconds + 3 = extractor_cpu_seconds + 4 = extractor_elapsed_seconds + */ + int kind : int ref, + float seconds : float ref +); + +/** + * An error or warning generated by the extractor. + * The diagnostic message `diagnostic` was generated during compiler + * invocation `compilation`, and is the `file_number_diagnostic_number`th + * message generated while extracting the `file_number`th file of that + * invocation. + */ +#keyset[compilation, file_number, file_number_diagnostic_number] +diagnostic_for( + unique int diagnostic : @diagnostic ref, + int compilation : @compilation ref, + int file_number : int ref, + int file_number_diagnostic_number : int ref +); + +/** + * The `cpu_seconds` and `elapsed_seconds` are the CPU time and elapsed + * time (respectively) that the original compilation (not the extraction) + * took for compiler invocation `id`. + */ +compilation_compiler_times( + unique int id : @compilation ref, + float cpu_seconds : float ref, + float elapsed_seconds : float ref +); + +/** + * If extraction was successful, then `cpu_seconds` and + * `elapsed_seconds` are the CPU time and elapsed time (respectively) + * that extraction took for compiler invocation `id`. 
+ * The `result` will indicate the extraction result: + * + * 0: Successfully extracted + * 1: Errors were encountered, but extraction recovered + * 2: Errors were encountered, and extraction could not recover + */ +compilation_finished( + unique int id : @compilation ref, + float cpu_seconds : float ref, + float elapsed_seconds : float ref, + int result : int ref +); + +diagnostics( + unique int id: @diagnostic, + string generated_by: string ref, // TODO: Sync this with the other languages? + int severity: int ref, + string error_tag: string ref, + string error_message: string ref, + string full_error_message: string ref, + int location: @location_default ref +); + +/* + * External artifacts + */ + +externalData( + int id : @externalDataElement, + string path : string ref, + int column: int ref, + string value : string ref +); + +snapshotDate( + unique date snapshotDate : date ref +); + +sourceLocationPrefix( + string prefix : string ref +); + +/* + * Duplicate code + */ + +duplicateCode( + unique int id : @duplication, + string relativePath : string ref, + int equivClass : int ref +); + +similarCode( + unique int id : @similarity, + string relativePath : string ref, + int equivClass : int ref +); + +@duplication_or_similarity = @duplication | @similarity + +tokens( + int id : @duplication_or_similarity ref, + int offset : int ref, + int beginLine : int ref, + int beginColumn : int ref, + int endLine : int ref, + int endColumn : int ref +); + +/* + * SMAP + */ + +smap_header( + int outputFileId: @file ref, + string outputFilename: string ref, + string defaultStratum: string ref +); + +smap_files( + int outputFileId: @file ref, + string stratum: string ref, + int inputFileNum: int ref, + string inputFileName: string ref, + int inputFileId: @file ref +); + +smap_lines( + int outputFileId: @file ref, + string stratum: string ref, + int inputFileNum: int ref, + int inputStartLine: int ref, + int inputLineCount: int ref, + int outputStartLine: int ref, + int 
outputLineIncrement: int ref +); + +/* + * Locations and files + */ + +@location = @location_default ; + +locations_default( + unique int id: @location_default, + int file: @file ref, + int beginLine: int ref, + int beginColumn: int ref, + int endLine: int ref, + int endColumn: int ref +); + +hasLocation( + int locatableid: @locatable ref, + int id: @location ref +); + +@sourceline = @locatable ; + +#keyset[element_id] +numlines( + int element_id: @sourceline ref, + int num_lines: int ref, + int num_code: int ref, + int num_comment: int ref +); + +files( + unique int id: @file, + string name: string ref +); + +folders( + unique int id: @folder, + string name: string ref +); + +@container = @folder | @file + +containerparent( + int parent: @container ref, + unique int child: @container ref +); + +/* + * Java + */ + +cupackage( + unique int id: @file ref, + int packageid: @package ref +); + +#keyset[fileid,keyName] +jarManifestMain( + int fileid: @file ref, + string keyName: string ref, + string value: string ref +); + +#keyset[fileid,entryName,keyName] +jarManifestEntries( + int fileid: @file ref, + string entryName: string ref, + string keyName: string ref, + string value: string ref +); + +packages( + unique int id: @package, + string nodeName: string ref +); + +primitives( + unique int id: @primitive, + string nodeName: string ref +); + +modifiers( + unique int id: @modifier, + string nodeName: string ref +); + +/** + * An errortype is used when the extractor is unable to extract a type + * correctly for some reason. 
+ */ +error_type( + unique int id: @errortype +); + +classes( + unique int id: @class, + string nodeName: string ref, + int parentid: @package ref, + int sourceid: @class ref +); + +file_class( + int id: @class ref +); + +class_object( + unique int id: @class ref, + unique int instance: @field ref +); + +type_companion_object( + unique int id: @classorinterface ref, + unique int instance: @field ref, + unique int companion_object: @class ref +); + +kt_nullable_types( + unique int id: @kt_nullable_type, + int classid: @reftype ref +) + +kt_notnull_types( + unique int id: @kt_notnull_type, + int classid: @reftype ref +) + +kt_type_alias( + unique int id: @kt_type_alias, + string name: string ref, + int kttypeid: @kt_type ref +) + +@kt_type = @kt_nullable_type | @kt_notnull_type + +isRecord( + unique int id: @class ref +); + +interfaces( + unique int id: @interface, + string nodeName: string ref, + int parentid: @package ref, + int sourceid: @interface ref +); + +fielddecls( + unique int id: @fielddecl, + int parentid: @reftype ref +); + +#keyset[fieldId] #keyset[fieldDeclId,pos] +fieldDeclaredIn( + int fieldId: @field ref, + int fieldDeclId: @fielddecl ref, + int pos: int ref +); + +fields( + unique int id: @field, + string nodeName: string ref, + int typeid: @type ref, + int parentid: @reftype ref, + int sourceid: @field ref +); + +fieldsKotlinType( + unique int id: @field ref, + int kttypeid: @kt_type ref +); + +constrs( + unique int id: @constructor, + string nodeName: string ref, + string signature: string ref, + int typeid: @type ref, + int parentid: @reftype ref, + int sourceid: @constructor ref +); + +constrsKotlinType( + unique int id: @constructor ref, + int kttypeid: @kt_type ref +); + +methods( + unique int id: @method, + string nodeName: string ref, + string signature: string ref, + int typeid: @type ref, + int parentid: @reftype ref, + int sourceid: @method ref +); + +methodsKotlinType( + unique int id: @method ref, + int kttypeid: @kt_type ref +); + 
+#keyset[parentid,pos] +params( + unique int id: @param, + int typeid: @type ref, + int pos: int ref, + int parentid: @callable ref, + int sourceid: @param ref +); + +paramsKotlinType( + unique int id: @param ref, + int kttypeid: @kt_type ref +); + +paramName( + unique int id: @param ref, + string nodeName: string ref +); + +isVarargsParam( + int param: @param ref +); + +exceptions( + unique int id: @exception, + int typeid: @type ref, + int parentid: @callable ref +); + +isAnnotType( + int interfaceid: @interface ref +); + +isAnnotElem( + int methodid: @method ref +); + +annotValue( + int parentid: @annotation ref, + int id2: @method ref, + unique int value: @expr ref +); + +isEnumType( + int classid: @class ref +); + +isEnumConst( + int fieldid: @field ref +); + +#keyset[parentid,pos] +typeVars( + unique int id: @typevariable, + string nodeName: string ref, + int pos: int ref, + int kind: int ref, // deprecated + int parentid: @classorinterfaceorcallable ref +); + +wildcards( + unique int id: @wildcard, + string nodeName: string ref, + int kind: int ref +); + +#keyset[parentid,pos] +typeBounds( + unique int id: @typebound, + int typeid: @reftype ref, + int pos: int ref, + int parentid: @boundedtype ref +); + +#keyset[parentid,pos] +typeArgs( + int argumentid: @reftype ref, + int pos: int ref, + int parentid: @classorinterfaceorcallable ref +); + +isParameterized( + int memberid: @member ref +); + +isRaw( + int memberid: @member ref +); + +erasure( + unique int memberid: @member ref, + int erasureid: @member ref +); + +#keyset[classid] #keyset[parent] +isAnonymClass( + int classid: @class ref, + int parent: @classinstancexpr ref +); + +#keyset[typeid] #keyset[parent] +isLocalClassOrInterface( + int typeid: @classorinterface ref, + int parent: @localtypedeclstmt ref +); + +isDefConstr( + int constructorid: @constructor ref +); + +#keyset[exprId] +lambdaKind( + int exprId: @lambdaexpr ref, + int bodyKind: int ref +); + +arrays( + unique int id: @array, + string 
nodeName: string ref, + int elementtypeid: @type ref, + int dimension: int ref, + int componenttypeid: @type ref +); + +enclInReftype( + unique int child: @reftype ref, + int parent: @reftype ref +); + +extendsReftype( + int id1: @reftype ref, + int id2: @classorinterface ref +); + +implInterface( + int id1: @classorarray ref, + int id2: @interface ref +); + +permits( + int id1: @classorinterface ref, + int id2: @classorinterface ref +); + +hasModifier( + int id1: @modifiable ref, + int id2: @modifier ref +); + +imports( + unique int id: @import, + int holder: @classorinterfaceorpackage ref, + string name: string ref, + int kind: int ref +); + +#keyset[parent,idx] +stmts( + unique int id: @stmt, + int kind: int ref, + int parent: @stmtparent ref, + int idx: int ref, + int bodydecl: @callable ref +); + +@stmtparent = @callable | @stmt | @switchexpr | @whenexpr| @stmtexpr; + +case @stmt.kind of + 0 = @block +| 1 = @ifstmt +| 2 = @forstmt +| 3 = @enhancedforstmt +| 4 = @whilestmt +| 5 = @dostmt +| 6 = @trystmt +| 7 = @switchstmt +| 8 = @synchronizedstmt +| 9 = @returnstmt +| 10 = @throwstmt +| 11 = @breakstmt +| 12 = @continuestmt +| 13 = @emptystmt +| 14 = @exprstmt +| 15 = @labeledstmt +| 16 = @assertstmt +| 17 = @localvariabledeclstmt +| 18 = @localtypedeclstmt +| 19 = @constructorinvocationstmt +| 20 = @superconstructorinvocationstmt +| 21 = @case +| 22 = @catchclause +| 23 = @yieldstmt +| 24 = @errorstmt +| 25 = @whenbranch +; + +#keyset[parent,idx] +exprs( + unique int id: @expr, + int kind: int ref, + int typeid: @type ref, + int parent: @exprparent ref, + int idx: int ref +); + +exprsKotlinType( + unique int id: @expr ref, + int kttypeid: @kt_type ref +); + +callableEnclosingExpr( + unique int id: @expr ref, + int callable_id: @callable ref +); + +statementEnclosingExpr( + unique int id: @expr ref, + int statement_id: @stmt ref +); + +isParenthesized( + unique int id: @expr ref, + int parentheses: int ref +); + +case @expr.kind of + 1 = @arrayaccess +| 2 = 
@arraycreationexpr +| 3 = @arrayinit +| 4 = @assignexpr +| 5 = @assignaddexpr +| 6 = @assignsubexpr +| 7 = @assignmulexpr +| 8 = @assigndivexpr +| 9 = @assignremexpr +| 10 = @assignandexpr +| 11 = @assignorexpr +| 12 = @assignxorexpr +| 13 = @assignlshiftexpr +| 14 = @assignrshiftexpr +| 15 = @assignurshiftexpr +| 16 = @booleanliteral +| 17 = @integerliteral +| 18 = @longliteral +| 19 = @floatingpointliteral +| 20 = @doubleliteral +| 21 = @characterliteral +| 22 = @stringliteral +| 23 = @nullliteral +| 24 = @mulexpr +| 25 = @divexpr +| 26 = @remexpr +| 27 = @addexpr +| 28 = @subexpr +| 29 = @lshiftexpr +| 30 = @rshiftexpr +| 31 = @urshiftexpr +| 32 = @andbitexpr +| 33 = @orbitexpr +| 34 = @xorbitexpr +| 35 = @andlogicalexpr +| 36 = @orlogicalexpr +| 37 = @ltexpr +| 38 = @gtexpr +| 39 = @leexpr +| 40 = @geexpr +| 41 = @eqexpr +| 42 = @neexpr +| 43 = @postincexpr +| 44 = @postdecexpr +| 45 = @preincexpr +| 46 = @predecexpr +| 47 = @minusexpr +| 48 = @plusexpr +| 49 = @bitnotexpr +| 50 = @lognotexpr +| 51 = @castexpr +| 52 = @newexpr +| 53 = @conditionalexpr +| 54 = @parexpr // deprecated +| 55 = @instanceofexpr +| 56 = @localvariabledeclexpr +| 57 = @typeliteral +| 58 = @thisaccess +| 59 = @superaccess +| 60 = @varaccess +| 61 = @methodaccess +| 62 = @unannotatedtypeaccess +| 63 = @arraytypeaccess +| 64 = @packageaccess +| 65 = @wildcardtypeaccess +| 66 = @declannotation +| 67 = @uniontypeaccess +| 68 = @lambdaexpr +| 69 = @memberref +| 70 = @annotatedtypeaccess +| 71 = @typeannotation +| 72 = @intersectiontypeaccess +| 73 = @switchexpr +| 74 = @errorexpr +| 75 = @whenexpr +| 76 = @getclassexpr +| 77 = @safecastexpr +| 78 = @implicitcastexpr +| 79 = @implicitnotnullexpr +| 80 = @implicitcoerciontounitexpr +| 81 = @notinstanceofexpr +| 82 = @stmtexpr +| 83 = @stringtemplateexpr +| 84 = @notnullexpr +| 85 = @unsafecoerceexpr +| 86 = @valueeqexpr +| 87 = @valueneexpr +| 88 = @propertyref +; + +/** Holds if this `when` expression was written as an `if` expression. 
*/ +when_if(unique int id: @whenexpr ref); + +/** Holds if this `when` branch was written as an `else` branch. */ +when_branch_else(unique int id: @whenbranch ref); + +@classinstancexpr = @newexpr | @lambdaexpr | @memberref | @propertyref + +@annotation = @declannotation | @typeannotation +@typeaccess = @unannotatedtypeaccess | @annotatedtypeaccess + +@assignment = @assignexpr + | @assignop; + +@unaryassignment = @postincexpr + | @postdecexpr + | @preincexpr + | @predecexpr; + +@assignop = @assignaddexpr + | @assignsubexpr + | @assignmulexpr + | @assigndivexpr + | @assignremexpr + | @assignandexpr + | @assignorexpr + | @assignxorexpr + | @assignlshiftexpr + | @assignrshiftexpr + | @assignurshiftexpr; + +@literal = @booleanliteral + | @integerliteral + | @longliteral + | @floatingpointliteral + | @doubleliteral + | @characterliteral + | @stringliteral + | @nullliteral; + +@binaryexpr = @mulexpr + | @divexpr + | @remexpr + | @addexpr + | @subexpr + | @lshiftexpr + | @rshiftexpr + | @urshiftexpr + | @andbitexpr + | @orbitexpr + | @xorbitexpr + | @andlogicalexpr + | @orlogicalexpr + | @ltexpr + | @gtexpr + | @leexpr + | @geexpr + | @eqexpr + | @neexpr + | @valueeqexpr + | @valueneexpr; + +@unaryexpr = @postincexpr + | @postdecexpr + | @preincexpr + | @predecexpr + | @minusexpr + | @plusexpr + | @bitnotexpr + | @lognotexpr + | @notnullexpr; + +@caller = @classinstancexpr + | @methodaccess + | @constructorinvocationstmt + | @superconstructorinvocationstmt; + +callableBinding( + unique int callerid: @caller ref, + int callee: @callable ref +); + +memberRefBinding( + unique int id: @expr ref, + int callable: @callable ref +); + +propertyRefGetBinding( + unique int id: @expr ref, + int getter: @callable ref +); + +propertyRefFieldBinding( + unique int id: @expr ref, + int field: @field ref +); + +propertyRefSetBinding( + unique int id: @expr ref, + int setter: @callable ref +); + +@exprparent = @stmt | @expr | @whenbranch | @callable | @field | @fielddecl | @class | 
@interface | @param | @localvar | @typevariable; + +variableBinding( + unique int expr: @varaccess ref, + int variable: @variable ref +); + +@variable = @localscopevariable | @field; + +@localscopevariable = @localvar | @param; + +localvars( + unique int id: @localvar, + string nodeName: string ref, + int typeid: @type ref, + int parentid: @localvariabledeclexpr ref +); + +localvarsKotlinType( + unique int id: @localvar ref, + int kttypeid: @kt_type ref +); + +@namedexprorstmt = @breakstmt + | @continuestmt + | @labeledstmt + | @literal; + +namestrings( + string name: string ref, + string value: string ref, + unique int parent: @namedexprorstmt ref +); + +/* + * Modules + */ + +#keyset[name] +modules( + unique int id: @module, + string name: string ref +); + +isOpen( + int id: @module ref +); + +#keyset[fileId] +cumodule( + int fileId: @file ref, + int moduleId: @module ref +); + +@directive = @requires + | @exports + | @opens + | @uses + | @provides + +#keyset[directive] +directives( + int id: @module ref, + int directive: @directive ref +); + +requires( + unique int id: @requires, + int target: @module ref +); + +isTransitive( + int id: @requires ref +); + +isStatic( + int id: @requires ref +); + +exports( + unique int id: @exports, + int target: @package ref +); + +exportsTo( + int id: @exports ref, + int target: @module ref +); + +opens( + unique int id: @opens, + int target: @package ref +); + +opensTo( + int id: @opens ref, + int target: @module ref +); + +uses( + unique int id: @uses, + string serviceInterface: string ref +); + +provides( + unique int id: @provides, + string serviceInterface: string ref +); + +providesWith( + int id: @provides ref, + string serviceImpl: string ref +); + +/* + * Javadoc + */ + +javadoc( + unique int id: @javadoc +); + +isNormalComment( + int commentid : @javadoc ref +); + +isEolComment( + int commentid : @javadoc ref +); + +hasJavadoc( + int documentableid: @member ref, + int javadocid: @javadoc ref +); + 
+#keyset[parentid,idx] +javadocTag( + unique int id: @javadocTag, + string name: string ref, + int parentid: @javadocParent ref, + int idx: int ref +); + +#keyset[parentid,idx] +javadocText( + unique int id: @javadocText, + string text: string ref, + int parentid: @javadocParent ref, + int idx: int ref +); + +@javadocParent = @javadoc | @javadocTag; +@javadocElement = @javadocTag | @javadocText; + +@classorinterface = @interface | @class; +@classorinterfaceorpackage = @classorinterface | @package; +@classorinterfaceorcallable = @classorinterface | @callable; +@boundedtype = @typevariable | @wildcard; +@reftype = @classorinterface | @array | @boundedtype | @errortype; +@classorarray = @class | @array; +@type = @primitive | @reftype; +@callable = @method | @constructor; + +/** A program element that has a name. */ +@element = @package | @modifier | @annotation | @errortype | + @locatableElement; + +@locatableElement = @file | @primitive | @class | @interface | @method | @constructor | @param | @exception | @field | + @boundedtype | @array | @localvar | @expr | @stmt | @import | @fielddecl | @kt_type | @kt_type_alias | + @kt_property; + +@modifiable = @member_modifiable| @param | @localvar | @typevariable; + +@member_modifiable = @class | @interface | @method | @constructor | @field | @kt_property; + +@member = @method | @constructor | @field | @reftype ; + +/** A program element that has a location. 
*/ +@locatable = @typebound | @javadoc | @javadocTag | @javadocText | @xmllocatable | @ktcomment | + @locatableElement; + +@top = @element | @locatable | @folder; + +/* + * XML Files + */ + +xmlEncoding( + unique int id: @file ref, + string encoding: string ref +); + +xmlDTDs( + unique int id: @xmldtd, + string root: string ref, + string publicId: string ref, + string systemId: string ref, + int fileid: @file ref +); + +xmlElements( + unique int id: @xmlelement, + string name: string ref, + int parentid: @xmlparent ref, + int idx: int ref, + int fileid: @file ref +); + +xmlAttrs( + unique int id: @xmlattribute, + int elementid: @xmlelement ref, + string name: string ref, + string value: string ref, + int idx: int ref, + int fileid: @file ref +); + +xmlNs( + int id: @xmlnamespace, + string prefixName: string ref, + string URI: string ref, + int fileid: @file ref +); + +xmlHasNs( + int elementId: @xmlnamespaceable ref, + int nsId: @xmlnamespace ref, + int fileid: @file ref +); + +xmlComments( + unique int id: @xmlcomment, + string text: string ref, + int parentid: @xmlparent ref, + int fileid: @file ref +); + +xmlChars( + unique int id: @xmlcharacters, + string text: string ref, + int parentid: @xmlparent ref, + int idx: int ref, + int isCDATA: int ref, + int fileid: @file ref +); + +@xmlparent = @file | @xmlelement; +@xmlnamespaceable = @xmlelement | @xmlattribute; + +xmllocations( + int xmlElement: @xmllocatable ref, + int location: @location_default ref +); + +@xmllocatable = @xmlcharacters | @xmlelement | @xmlcomment | @xmlattribute | @xmldtd | @file | @xmlnamespace; + +/* + * configuration files with key value pairs + */ + +configs( + unique int id: @config +); + +configNames( + unique int id: @configName, + int config: @config ref, + string name: string ref +); + +configValues( + unique int id: @configValue, + int config: @config ref, + string value: string ref +); + +configLocations( + int locatable: @configLocatable ref, + int location: @location_default ref 
+); + +@configLocatable = @config | @configName | @configValue; + +ktComments( + unique int id: @ktcomment, + int kind: int ref, + string text : string ref +) + +ktCommentSections( + unique int id: @ktcommentsection, + int comment: @ktcomment ref, + string content : string ref +) + +ktCommentSectionNames( + unique int id: @ktcommentsection ref, + string name : string ref +) + +ktCommentSectionSubjectNames( + unique int id: @ktcommentsection ref, + string subjectname : string ref +) + +#keyset[id, owner] +ktCommentOwners( + int id: @ktcomment ref, + int owner: @top ref +) + +ktExtensionFunctions( + unique int id: @method ref, + int typeid: @type ref, + int kttypeid: @kt_type ref +) + +ktProperties( + unique int id: @kt_property, + string nodeName: string ref +) + +ktPropertyGetters( + unique int id: @kt_property ref, + int getter: @method ref +) + +ktPropertySetters( + unique int id: @kt_property ref, + int setter: @method ref +) + +ktPropertyBackingFields( + unique int id: @kt_property ref, + int backingField: @field ref +) + +ktSyntheticBody( + unique int id: @callable ref, + int kind: int ref + // 1: ENUM_VALUES + // 2: ENUM_VALUEOF +) + +ktLocalFunction( + unique int id: @method ref +) + +ktInitializerAssignment( + unique int id: @assignexpr ref +) + +ktPropertyDelegates( + unique int id: @kt_property ref, + unique int variableId: @variable ref +) + +/** + * If `id` is a compiler generated element, then the kind indicates the + * reason that the compiler generated it. + * See `Element.compilerGeneratedReason()` for an explanation of what + * each `kind` means. 
+ */ +compiler_generated( + unique int id: @element ref, + int kind: int ref +) + +ktFunctionOriginalNames( + unique int id: @method ref, + string name: string ref +) + +ktDataClasses( + unique int id: @class ref +) diff --git a/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/semmlecode.dbscheme b/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/semmlecode.dbscheme new file mode 100644 index 00000000000..44d61b266be --- /dev/null +++ b/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/semmlecode.dbscheme @@ -0,0 +1,1246 @@ +/** + * An invocation of the compiler. Note that more than one file may be + * compiled per invocation. For example, this command compiles three + * source files: + * + * javac A.java B.java C.java + * + * The `id` simply identifies the invocation, while `cwd` is the working + * directory from which the compiler was invoked. + */ +compilations( + /** + * An invocation of the compiler. Note that more than one file may + * be compiled per invocation. For example, this command compiles + * three source files: + * + * javac A.java B.java C.java + */ + unique int id : @compilation, + int kind: int ref, + string cwd : string ref, + string name : string ref +); + +case @compilation.kind of + 1 = @javacompilation +| 2 = @kotlincompilation +; + +compilation_started( + int id : @compilation ref +) + +compilation_info( + int id : @compilation ref, + string info_key: string ref, + string info_value: string ref +) + +/** + * The arguments that were passed to the extractor for a compiler + * invocation. 
If `id` is for the compiler invocation + * + * javac A.java B.java C.java + * + * then typically there will be rows for + * + * num | arg + * --- | --- + * 0 | *path to extractor* + * 1 | `--javac-args` + * 2 | A.java + * 3 | B.java + * 4 | C.java + */ +#keyset[id, num] +compilation_args( + int id : @compilation ref, + int num : int ref, + string arg : string ref +); + +/** + * The source files that are compiled by a compiler invocation. + * If `id` is for the compiler invocation + * + * javac A.java B.java C.java + * + * then there will be rows for + * + * num | arg + * --- | --- + * 0 | A.java + * 1 | B.java + * 2 | C.java + */ +#keyset[id, num] +compilation_compiling_files( + int id : @compilation ref, + int num : int ref, + int file : @file ref +); + +/** + * For each file recorded in `compilation_compiling_files`, + * there will be a corresponding row in + * `compilation_compiling_files_completed` once extraction + * of that file is complete. The `result` will indicate the + * extraction result: + * + * 0: Successfully extracted + * 1: Errors were encountered, but extraction recovered + * 2: Errors were encountered, and extraction could not recover + */ +#keyset[id, num] +compilation_compiling_files_completed( + int id : @compilation ref, + int num : int ref, + int result : int ref +); + +/** + * The time taken by the extractor for a compiler invocation. 
+ * + * For each file `num`, there will be rows for + * + * kind | seconds + * ---- | --- + * 1 | CPU seconds used by the extractor frontend + * 2 | Elapsed seconds during the extractor frontend + * 3 | CPU seconds used by the extractor backend + * 4 | Elapsed seconds during the extractor backend + */ +#keyset[id, num, kind] +compilation_time( + int id : @compilation ref, + int num : int ref, + /* kind: + 1 = frontend_cpu_seconds + 2 = frontend_elapsed_seconds + 3 = extractor_cpu_seconds + 4 = extractor_elapsed_seconds + */ + int kind : int ref, + float seconds : float ref +); + +/** + * An error or warning generated by the extractor. + * The diagnostic message `diagnostic` was generated during compiler + * invocation `compilation`, and is the `file_number_diagnostic_number`th + * message generated while extracting the `file_number`th file of that + * invocation. + */ +#keyset[compilation, file_number, file_number_diagnostic_number] +diagnostic_for( + unique int diagnostic : @diagnostic ref, + int compilation : @compilation ref, + int file_number : int ref, + int file_number_diagnostic_number : int ref +); + +/** + * The `cpu_seconds` and `elapsed_seconds` are the CPU time and elapsed + * time (respectively) that the original compilation (not the extraction) + * took for compiler invocation `id`. + */ +compilation_compiler_times( + unique int id : @compilation ref, + float cpu_seconds : float ref, + float elapsed_seconds : float ref +); + +/** + * If extraction was successful, then `cpu_seconds` and + * `elapsed_seconds` are the CPU time and elapsed time (respectively) + * that extraction took for compiler invocation `id`. 
+ * The `result` will indicate the extraction result: + * + * 0: Successfully extracted + * 1: Errors were encountered, but extraction recovered + * 2: Errors were encountered, and extraction could not recover + */ +compilation_finished( + unique int id : @compilation ref, + float cpu_seconds : float ref, + float elapsed_seconds : float ref, + int result : int ref +); + +diagnostics( + unique int id: @diagnostic, + string generated_by: string ref, // TODO: Sync this with the other languages? + int severity: int ref, + string error_tag: string ref, + string error_message: string ref, + string full_error_message: string ref, + int location: @location_default ref +); + +/* + * External artifacts + */ + +externalData( + int id : @externalDataElement, + string path : string ref, + int column: int ref, + string value : string ref +); + +snapshotDate( + unique date snapshotDate : date ref +); + +sourceLocationPrefix( + string prefix : string ref +); + +/* + * Duplicate code + */ + +duplicateCode( + unique int id : @duplication, + string relativePath : string ref, + int equivClass : int ref +); + +similarCode( + unique int id : @similarity, + string relativePath : string ref, + int equivClass : int ref +); + +@duplication_or_similarity = @duplication | @similarity + +tokens( + int id : @duplication_or_similarity ref, + int offset : int ref, + int beginLine : int ref, + int beginColumn : int ref, + int endLine : int ref, + int endColumn : int ref +); + +/* + * SMAP + */ + +smap_header( + int outputFileId: @file ref, + string outputFilename: string ref, + string defaultStratum: string ref +); + +smap_files( + int outputFileId: @file ref, + string stratum: string ref, + int inputFileNum: int ref, + string inputFileName: string ref, + int inputFileId: @file ref +); + +smap_lines( + int outputFileId: @file ref, + string stratum: string ref, + int inputFileNum: int ref, + int inputStartLine: int ref, + int inputLineCount: int ref, + int outputStartLine: int ref, + int 
outputLineIncrement: int ref +); + +/* + * Locations and files + */ + +@location = @location_default ; + +locations_default( + unique int id: @location_default, + int file: @file ref, + int beginLine: int ref, + int beginColumn: int ref, + int endLine: int ref, + int endColumn: int ref +); + +hasLocation( + int locatableid: @locatable ref, + int id: @location ref +); + +@sourceline = @locatable ; + +#keyset[element_id] +numlines( + int element_id: @sourceline ref, + int num_lines: int ref, + int num_code: int ref, + int num_comment: int ref +); + +files( + unique int id: @file, + string name: string ref +); + +folders( + unique int id: @folder, + string name: string ref +); + +@container = @folder | @file + +containerparent( + int parent: @container ref, + unique int child: @container ref +); + +/* + * Java + */ + +cupackage( + unique int id: @file ref, + int packageid: @package ref +); + +#keyset[fileid,keyName] +jarManifestMain( + int fileid: @file ref, + string keyName: string ref, + string value: string ref +); + +#keyset[fileid,entryName,keyName] +jarManifestEntries( + int fileid: @file ref, + string entryName: string ref, + string keyName: string ref, + string value: string ref +); + +packages( + unique int id: @package, + string nodeName: string ref +); + +primitives( + unique int id: @primitive, + string nodeName: string ref +); + +modifiers( + unique int id: @modifier, + string nodeName: string ref +); + +/** + * An errortype is used when the extractor is unable to extract a type + * correctly for some reason. 
+ */ +error_type( + unique int id: @errortype +); + +classes( + unique int id: @class, + string nodeName: string ref, + int parentid: @package ref, + int sourceid: @class ref +); + +file_class( + int id: @class ref +); + +class_object( + unique int id: @class ref, + unique int instance: @field ref +); + +type_companion_object( + unique int id: @classorinterface ref, + unique int instance: @field ref, + unique int companion_object: @class ref +); + +kt_nullable_types( + unique int id: @kt_nullable_type, + int classid: @reftype ref +) + +kt_notnull_types( + unique int id: @kt_notnull_type, + int classid: @reftype ref +) + +kt_type_alias( + unique int id: @kt_type_alias, + string name: string ref, + int kttypeid: @kt_type ref +) + +@kt_type = @kt_nullable_type | @kt_notnull_type + +isRecord( + unique int id: @class ref +); + +interfaces( + unique int id: @interface, + string nodeName: string ref, + int parentid: @package ref, + int sourceid: @interface ref +); + +fielddecls( + unique int id: @fielddecl, + int parentid: @reftype ref +); + +#keyset[fieldId] #keyset[fieldDeclId,pos] +fieldDeclaredIn( + int fieldId: @field ref, + int fieldDeclId: @fielddecl ref, + int pos: int ref +); + +fields( + unique int id: @field, + string nodeName: string ref, + int typeid: @type ref, + int parentid: @reftype ref, + int sourceid: @field ref +); + +fieldsKotlinType( + unique int id: @field ref, + int kttypeid: @kt_type ref +); + +constrs( + unique int id: @constructor, + string nodeName: string ref, + string signature: string ref, + int typeid: @type ref, + int parentid: @reftype ref, + int sourceid: @constructor ref +); + +constrsKotlinType( + unique int id: @constructor ref, + int kttypeid: @kt_type ref +); + +methods( + unique int id: @method, + string nodeName: string ref, + string signature: string ref, + int typeid: @type ref, + int parentid: @reftype ref, + int sourceid: @method ref +); + +methodsKotlinType( + unique int id: @method ref, + int kttypeid: @kt_type ref +); + 
+#keyset[parentid,pos] +params( + unique int id: @param, + int typeid: @type ref, + int pos: int ref, + int parentid: @callable ref, + int sourceid: @param ref +); + +paramsKotlinType( + unique int id: @param ref, + int kttypeid: @kt_type ref +); + +paramName( + unique int id: @param ref, + string nodeName: string ref +); + +isVarargsParam( + int param: @param ref +); + +exceptions( + unique int id: @exception, + int typeid: @type ref, + int parentid: @callable ref +); + +isAnnotType( + int interfaceid: @interface ref +); + +isAnnotElem( + int methodid: @method ref +); + +annotValue( + int parentid: @annotation ref, + int id2: @method ref, + unique int value: @expr ref +); + +isEnumType( + int classid: @class ref +); + +isEnumConst( + int fieldid: @field ref +); + +#keyset[parentid,pos] +typeVars( + unique int id: @typevariable, + string nodeName: string ref, + int pos: int ref, + int kind: int ref, // deprecated + int parentid: @classorinterfaceorcallable ref +); + +wildcards( + unique int id: @wildcard, + string nodeName: string ref, + int kind: int ref +); + +#keyset[parentid,pos] +typeBounds( + unique int id: @typebound, + int typeid: @reftype ref, + int pos: int ref, + int parentid: @boundedtype ref +); + +#keyset[parentid,pos] +typeArgs( + int argumentid: @reftype ref, + int pos: int ref, + int parentid: @classorinterfaceorcallable ref +); + +isParameterized( + int memberid: @member ref +); + +isRaw( + int memberid: @member ref +); + +erasure( + unique int memberid: @member ref, + int erasureid: @member ref +); + +#keyset[classid] #keyset[parent] +isAnonymClass( + int classid: @class ref, + int parent: @classinstancexpr ref +); + +#keyset[typeid] #keyset[parent] +isLocalClassOrInterface( + int typeid: @classorinterface ref, + int parent: @localtypedeclstmt ref +); + +isDefConstr( + int constructorid: @constructor ref +); + +#keyset[exprId] +lambdaKind( + int exprId: @lambdaexpr ref, + int bodyKind: int ref +); + +arrays( + unique int id: @array, + string 
nodeName: string ref, + int elementtypeid: @type ref, + int dimension: int ref, + int componenttypeid: @type ref +); + +enclInReftype( + unique int child: @reftype ref, + int parent: @reftype ref +); + +extendsReftype( + int id1: @reftype ref, + int id2: @classorinterface ref +); + +implInterface( + int id1: @classorarray ref, + int id2: @interface ref +); + +permits( + int id1: @classorinterface ref, + int id2: @classorinterface ref +); + +hasModifier( + int id1: @modifiable ref, + int id2: @modifier ref +); + +imports( + unique int id: @import, + int holder: @classorinterfaceorpackage ref, + string name: string ref, + int kind: int ref +); + +#keyset[parent,idx] +stmts( + unique int id: @stmt, + int kind: int ref, + int parent: @stmtparent ref, + int idx: int ref, + int bodydecl: @callable ref +); + +@stmtparent = @callable | @stmt | @switchexpr | @whenexpr| @stmtexpr; + +case @stmt.kind of + 0 = @block +| 1 = @ifstmt +| 2 = @forstmt +| 3 = @enhancedforstmt +| 4 = @whilestmt +| 5 = @dostmt +| 6 = @trystmt +| 7 = @switchstmt +| 8 = @synchronizedstmt +| 9 = @returnstmt +| 10 = @throwstmt +| 11 = @breakstmt +| 12 = @continuestmt +| 13 = @emptystmt +| 14 = @exprstmt +| 15 = @labeledstmt +| 16 = @assertstmt +| 17 = @localvariabledeclstmt +| 18 = @localtypedeclstmt +| 19 = @constructorinvocationstmt +| 20 = @superconstructorinvocationstmt +| 21 = @case +| 22 = @catchclause +| 23 = @yieldstmt +| 24 = @errorstmt +| 25 = @whenbranch +; + +#keyset[parent,idx] +exprs( + unique int id: @expr, + int kind: int ref, + int typeid: @type ref, + int parent: @exprparent ref, + int idx: int ref +); + +exprsKotlinType( + unique int id: @expr ref, + int kttypeid: @kt_type ref +); + +callableEnclosingExpr( + unique int id: @expr ref, + int callable_id: @callable ref +); + +statementEnclosingExpr( + unique int id: @expr ref, + int statement_id: @stmt ref +); + +isParenthesized( + unique int id: @expr ref, + int parentheses: int ref +); + +case @expr.kind of + 1 = @arrayaccess +| 2 = 
@arraycreationexpr +| 3 = @arrayinit +| 4 = @assignexpr +| 5 = @assignaddexpr +| 6 = @assignsubexpr +| 7 = @assignmulexpr +| 8 = @assigndivexpr +| 9 = @assignremexpr +| 10 = @assignandexpr +| 11 = @assignorexpr +| 12 = @assignxorexpr +| 13 = @assignlshiftexpr +| 14 = @assignrshiftexpr +| 15 = @assignurshiftexpr +| 16 = @booleanliteral +| 17 = @integerliteral +| 18 = @longliteral +| 19 = @floatingpointliteral +| 20 = @doubleliteral +| 21 = @characterliteral +| 22 = @stringliteral +| 23 = @nullliteral +| 24 = @mulexpr +| 25 = @divexpr +| 26 = @remexpr +| 27 = @addexpr +| 28 = @subexpr +| 29 = @lshiftexpr +| 30 = @rshiftexpr +| 31 = @urshiftexpr +| 32 = @andbitexpr +| 33 = @orbitexpr +| 34 = @xorbitexpr +| 35 = @andlogicalexpr +| 36 = @orlogicalexpr +| 37 = @ltexpr +| 38 = @gtexpr +| 39 = @leexpr +| 40 = @geexpr +| 41 = @eqexpr +| 42 = @neexpr +| 43 = @postincexpr +| 44 = @postdecexpr +| 45 = @preincexpr +| 46 = @predecexpr +| 47 = @minusexpr +| 48 = @plusexpr +| 49 = @bitnotexpr +| 50 = @lognotexpr +| 51 = @castexpr +| 52 = @newexpr +| 53 = @conditionalexpr +| 54 = @parexpr // deprecated +| 55 = @instanceofexpr +| 56 = @localvariabledeclexpr +| 57 = @typeliteral +| 58 = @thisaccess +| 59 = @superaccess +| 60 = @varaccess +| 61 = @methodaccess +| 62 = @unannotatedtypeaccess +| 63 = @arraytypeaccess +| 64 = @packageaccess +| 65 = @wildcardtypeaccess +| 66 = @declannotation +| 67 = @uniontypeaccess +| 68 = @lambdaexpr +| 69 = @memberref +| 70 = @annotatedtypeaccess +| 71 = @typeannotation +| 72 = @intersectiontypeaccess +| 73 = @switchexpr +| 74 = @errorexpr +| 75 = @whenexpr +| 76 = @getclassexpr +| 77 = @safecastexpr +| 78 = @implicitcastexpr +| 79 = @implicitnotnullexpr +| 80 = @implicitcoerciontounitexpr +| 81 = @notinstanceofexpr +| 82 = @stmtexpr +| 83 = @stringtemplateexpr +| 84 = @notnullexpr +| 85 = @unsafecoerceexpr +| 86 = @valueeqexpr +| 87 = @valueneexpr +| 88 = @propertyref +; + +/** Holds if this `when` expression was written as an `if` expression. 
*/ +when_if(unique int id: @whenexpr ref); + +/** Holds if this `when` branch was written as an `else` branch. */ +when_branch_else(unique int id: @whenbranch ref); + +@classinstancexpr = @newexpr | @lambdaexpr | @memberref | @propertyref + +@annotation = @declannotation | @typeannotation +@typeaccess = @unannotatedtypeaccess | @annotatedtypeaccess + +@assignment = @assignexpr + | @assignop; + +@unaryassignment = @postincexpr + | @postdecexpr + | @preincexpr + | @predecexpr; + +@assignop = @assignaddexpr + | @assignsubexpr + | @assignmulexpr + | @assigndivexpr + | @assignremexpr + | @assignandexpr + | @assignorexpr + | @assignxorexpr + | @assignlshiftexpr + | @assignrshiftexpr + | @assignurshiftexpr; + +@literal = @booleanliteral + | @integerliteral + | @longliteral + | @floatingpointliteral + | @doubleliteral + | @characterliteral + | @stringliteral + | @nullliteral; + +@binaryexpr = @mulexpr + | @divexpr + | @remexpr + | @addexpr + | @subexpr + | @lshiftexpr + | @rshiftexpr + | @urshiftexpr + | @andbitexpr + | @orbitexpr + | @xorbitexpr + | @andlogicalexpr + | @orlogicalexpr + | @ltexpr + | @gtexpr + | @leexpr + | @geexpr + | @eqexpr + | @neexpr + | @valueeqexpr + | @valueneexpr; + +@unaryexpr = @postincexpr + | @postdecexpr + | @preincexpr + | @predecexpr + | @minusexpr + | @plusexpr + | @bitnotexpr + | @lognotexpr + | @notnullexpr; + +@caller = @classinstancexpr + | @methodaccess + | @constructorinvocationstmt + | @superconstructorinvocationstmt; + +callableBinding( + unique int callerid: @caller ref, + int callee: @callable ref +); + +memberRefBinding( + unique int id: @expr ref, + int callable: @callable ref +); + +propertyRefGetBinding( + unique int id: @expr ref, + int getter: @callable ref +); + +propertyRefFieldBinding( + unique int id: @expr ref, + int field: @field ref +); + +propertyRefSetBinding( + unique int id: @expr ref, + int setter: @callable ref +); + +@exprparent = @stmt | @expr | @whenbranch | @callable | @field | @fielddecl | @class | 
@interface | @param | @localvar | @typevariable; + +variableBinding( + unique int expr: @varaccess ref, + int variable: @variable ref +); + +@variable = @localscopevariable | @field; + +@localscopevariable = @localvar | @param; + +localvars( + unique int id: @localvar, + string nodeName: string ref, + int typeid: @type ref, + int parentid: @localvariabledeclexpr ref +); + +localvarsKotlinType( + unique int id: @localvar ref, + int kttypeid: @kt_type ref +); + +@namedexprorstmt = @breakstmt + | @continuestmt + | @labeledstmt + | @literal; + +namestrings( + string name: string ref, + string value: string ref, + unique int parent: @namedexprorstmt ref +); + +/* + * Modules + */ + +#keyset[name] +modules( + unique int id: @module, + string name: string ref +); + +isOpen( + int id: @module ref +); + +#keyset[fileId] +cumodule( + int fileId: @file ref, + int moduleId: @module ref +); + +@directive = @requires + | @exports + | @opens + | @uses + | @provides + +#keyset[directive] +directives( + int id: @module ref, + int directive: @directive ref +); + +requires( + unique int id: @requires, + int target: @module ref +); + +isTransitive( + int id: @requires ref +); + +isStatic( + int id: @requires ref +); + +exports( + unique int id: @exports, + int target: @package ref +); + +exportsTo( + int id: @exports ref, + int target: @module ref +); + +opens( + unique int id: @opens, + int target: @package ref +); + +opensTo( + int id: @opens ref, + int target: @module ref +); + +uses( + unique int id: @uses, + string serviceInterface: string ref +); + +provides( + unique int id: @provides, + string serviceInterface: string ref +); + +providesWith( + int id: @provides ref, + string serviceImpl: string ref +); + +/* + * Javadoc + */ + +javadoc( + unique int id: @javadoc +); + +isNormalComment( + int commentid : @javadoc ref +); + +isEolComment( + int commentid : @javadoc ref +); + +hasJavadoc( + int documentableid: @member ref, + int javadocid: @javadoc ref +); + 
+#keyset[parentid,idx] +javadocTag( + unique int id: @javadocTag, + string name: string ref, + int parentid: @javadocParent ref, + int idx: int ref +); + +#keyset[parentid,idx] +javadocText( + unique int id: @javadocText, + string text: string ref, + int parentid: @javadocParent ref, + int idx: int ref +); + +@javadocParent = @javadoc | @javadocTag; +@javadocElement = @javadocTag | @javadocText; + +@classorinterface = @interface | @class; +@classorinterfaceorpackage = @classorinterface | @package; +@classorinterfaceorcallable = @classorinterface | @callable; +@boundedtype = @typevariable | @wildcard; +@reftype = @classorinterface | @array | @boundedtype | @errortype; +@classorarray = @class | @array; +@type = @primitive | @reftype; +@callable = @method | @constructor; + +/** A program element that has a name. */ +@element = @package | @modifier | @annotation | @errortype | + @locatableElement; + +@locatableElement = @file | @primitive | @class | @interface | @method | @constructor | @param | @exception | @field | + @boundedtype | @array | @localvar | @expr | @stmt | @import | @fielddecl | @kt_type | @kt_type_alias | + @kt_property; + +@modifiable = @member_modifiable| @param | @localvar | @typevariable; + +@member_modifiable = @class | @interface | @method | @constructor | @field | @kt_property; + +@member = @method | @constructor | @field | @reftype ; + +/** A program element that has a location. 
*/ +@locatable = @typebound | @javadoc | @javadocTag | @javadocText | @xmllocatable | @ktcomment | + @locatableElement; + +@top = @element | @locatable | @folder; + +/* + * XML Files + */ + +xmlEncoding( + unique int id: @file ref, + string encoding: string ref +); + +xmlDTDs( + unique int id: @xmldtd, + string root: string ref, + string publicId: string ref, + string systemId: string ref, + int fileid: @file ref +); + +xmlElements( + unique int id: @xmlelement, + string name: string ref, + int parentid: @xmlparent ref, + int idx: int ref, + int fileid: @file ref +); + +xmlAttrs( + unique int id: @xmlattribute, + int elementid: @xmlelement ref, + string name: string ref, + string value: string ref, + int idx: int ref, + int fileid: @file ref +); + +xmlNs( + int id: @xmlnamespace, + string prefixName: string ref, + string URI: string ref, + int fileid: @file ref +); + +xmlHasNs( + int elementId: @xmlnamespaceable ref, + int nsId: @xmlnamespace ref, + int fileid: @file ref +); + +xmlComments( + unique int id: @xmlcomment, + string text: string ref, + int parentid: @xmlparent ref, + int fileid: @file ref +); + +xmlChars( + unique int id: @xmlcharacters, + string text: string ref, + int parentid: @xmlparent ref, + int idx: int ref, + int isCDATA: int ref, + int fileid: @file ref +); + +@xmlparent = @file | @xmlelement; +@xmlnamespaceable = @xmlelement | @xmlattribute; + +xmllocations( + int xmlElement: @xmllocatable ref, + int location: @location_default ref +); + +@xmllocatable = @xmlcharacters | @xmlelement | @xmlcomment | @xmlattribute | @xmldtd | @file | @xmlnamespace; + +/* + * configuration files with key value pairs + */ + +configs( + unique int id: @config +); + +configNames( + unique int id: @configName, + int config: @config ref, + string name: string ref +); + +configValues( + unique int id: @configValue, + int config: @config ref, + string value: string ref +); + +configLocations( + int locatable: @configLocatable ref, + int location: @location_default ref 
+); + +@configLocatable = @config | @configName | @configValue; + +ktComments( + unique int id: @ktcomment, + int kind: int ref, + string text : string ref +) + +ktCommentSections( + unique int id: @ktcommentsection, + int comment: @ktcomment ref, + string content : string ref +) + +ktCommentSectionNames( + unique int id: @ktcommentsection ref, + string name : string ref +) + +ktCommentSectionSubjectNames( + unique int id: @ktcommentsection ref, + string subjectname : string ref +) + +#keyset[id, owner] +ktCommentOwners( + int id: @ktcomment ref, + int owner: @top ref +) + +ktExtensionFunctions( + unique int id: @method ref, + int typeid: @type ref, + int kttypeid: @kt_type ref +) + +ktProperties( + unique int id: @kt_property, + string nodeName: string ref +) + +ktPropertyGetters( + unique int id: @kt_property ref, + int getter: @method ref +) + +ktPropertySetters( + unique int id: @kt_property ref, + int setter: @method ref +) + +ktPropertyBackingFields( + unique int id: @kt_property ref, + int backingField: @field ref +) + +ktSyntheticBody( + unique int id: @callable ref, + int kind: int ref + // 1: ENUM_VALUES + // 2: ENUM_VALUEOF +) + +ktLocalFunction( + unique int id: @method ref +) + +ktInitializerAssignment( + unique int id: @assignexpr ref +) + +ktPropertyDelegates( + unique int id: @kt_property ref, + unique int variableId: @variable ref +) + +/** + * If `id` is a compiler generated element, then the kind indicates the + * reason that the compiler generated it. + * See `Element.compilerGeneratedReason()` for an explanation of what + * each `kind` means. 
+ */ +compiler_generated( + unique int id: @element ref, + int kind: int ref +) + +ktFunctionOriginalNames( + unique int id: @method ref, + string name: string ref +) + +ktDataClasses( + unique int id: @class ref +) diff --git a/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/upgrade.properties b/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/upgrade.properties new file mode 100644 index 00000000000..1c05ac39dbe --- /dev/null +++ b/java/ql/lib/upgrades/709f1d1fd04ffd9bbcf242f17b120f8a389949bd/upgrade.properties @@ -0,0 +1,2 @@ +description: Add compilation_info +compatibility: backwards diff --git a/java/ql/src/Security/CWE/CWE-020/OverlyLargeRange.ql b/java/ql/src/Security/CWE/CWE-020/OverlyLargeRange.ql index d054659892c..b8ea3e52dbd 100644 --- a/java/ql/src/Security/CWE/CWE-020/OverlyLargeRange.ql +++ b/java/ql/src/Security/CWE/CWE-020/OverlyLargeRange.ql @@ -12,14 +12,15 @@ * external/cwe/cwe-020 */ -import semmle.code.java.security.OverlyLargeRangeQuery +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.OverlyLargeRangeQuery::Make -RegExpCharacterClass potentialMisparsedCharClass() { +TreeView::RegExpCharacterClass potentialMisparsedCharClass() { // nested char classes are currently misparsed - result.getAChild().(RegExpNormalChar).getValue() = "[" + result.getAChild().(TreeView::RegExpNormalChar).getValue() = "[" } -from RegExpCharacterRange range, string reason +from TreeView::RegExpCharacterRange range, string reason where problem(range, reason) and not range.getParent() = potentialMisparsedCharClass() diff --git a/java/ql/src/Security/CWE/CWE-730/PolynomialReDoS.ql b/java/ql/src/Security/CWE/CWE-730/PolynomialReDoS.ql index 75cd8335fac..a84f1c5213e 100644 --- a/java/ql/src/Security/CWE/CWE-730/PolynomialReDoS.ql +++ b/java/ql/src/Security/CWE/CWE-730/PolynomialReDoS.ql @@ -17,7 +17,9 @@ import java import semmle.code.java.security.regexp.PolynomialReDoSQuery import 
DataFlow::PathGraph -from DataFlow::PathNode source, DataFlow::PathNode sink, PolynomialBackTrackingTerm regexp +from + DataFlow::PathNode source, DataFlow::PathNode sink, + SuperlinearBackTracking::PolynomialBackTrackingTerm regexp where hasPolynomialReDoSResult(source, sink, regexp) select sink, source, sink, "This $@ that depends on a $@ may run slow on strings " + regexp.getPrefixMessage() + diff --git a/java/ql/src/Security/CWE/CWE-730/ReDoS.ql b/java/ql/src/Security/CWE/CWE-730/ReDoS.ql index 23e258e8915..ca4750fc858 100644 --- a/java/ql/src/Security/CWE/CWE-730/ReDoS.ql +++ b/java/ql/src/Security/CWE/CWE-730/ReDoS.ql @@ -14,12 +14,12 @@ * external/cwe/cwe-400 */ -import java -import semmle.code.java.security.regexp.ExponentialBackTracking +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.nfa.ExponentialBackTracking::Make as ExponentialBackTracking -from RegExpTerm t, string pump, State s, string prefixMsg +from TreeView::RegExpTerm t, string pump, ExponentialBackTracking::State s, string prefixMsg where - hasReDoSResult(t, pump, s, prefixMsg) and + ExponentialBackTracking::hasReDoSResult(t, pump, s, prefixMsg) and // exclude verbose mode regexes for now not t.getRegex().getAMode() = "VERBOSE" select t, diff --git a/java/ql/src/Telemetry/ExtractorInformation.ql b/java/ql/src/Telemetry/ExtractorInformation.ql index 0eb420ba651..48eb49a1b07 100644 --- a/java/ql/src/Telemetry/ExtractorInformation.ql +++ b/java/ql/src/Telemetry/ExtractorInformation.ql @@ -9,6 +9,13 @@ import java import semmle.code.java.Diagnostics +predicate compilationInfo(string key, int value) { + exists(Compilation c, string infoKey | + key = infoKey + ": " + c.getInfo(infoKey) and + value = 1 + ) +} + predicate fileCount(string key, int value) { key = "Number of files" and value = strictcount(File f) @@ -53,13 +60,38 @@ predicate extractorDiagnostics(string key, int value) { ) } +/* + * Just counting the diagnostics doesn't give the full 
picture, as + * CODEQL_EXTRACTOR_KOTLIN_DIAGNOSTIC_LIMIT means that some diagnostics + * will be suppressed. In that case, we need to look for the + * suppression message, uncount those that did get emitted, uncount the + * suppression message itself, and then add on the full count. + */ + +predicate extractorTotalDiagnostics(string key, int value) { + exists(string extractor, string limitRegex | + limitRegex = "Total of ([0-9]+) diagnostics \\(reached limit of ([0-9]+)\\).*" and + key = "Total number of diagnostics from " + extractor and + value = + strictcount(Diagnostic d | d.getGeneratedBy() = extractor) + + sum(Diagnostic d | + d.getGeneratedBy() = extractor + | + d.getMessage().regexpCapture(limitRegex, 1).toInt() - + d.getMessage().regexpCapture(limitRegex, 2).toInt() - 1 + ) + ) +} + from string key, int value where + compilationInfo(key, value) or fileCount(key, value) or fileCountByExtension(key, value) or totalNumberOfLines(key, value) or numberOfLinesOfCode(key, value) or totalNumberOfLinesByExtension(key, value) or numberOfLinesOfCodeByExtension(key, value) or - extractorDiagnostics(key, value) + extractorDiagnostics(key, value) or + extractorTotalDiagnostics(key, value) select key, value diff --git a/java/ql/src/utils/model-generator/CaptureNegativeSummaryModels.ql b/java/ql/src/utils/model-generator/CaptureNegativeSummaryModels.ql index 49ed16aa1dc..a6f6e5f26e5 100644 --- a/java/ql/src/utils/model-generator/CaptureNegativeSummaryModels.ql +++ b/java/ql/src/utils/model-generator/CaptureNegativeSummaryModels.ql @@ -6,8 +6,8 @@ * @tags model-generator */ -import internal.CaptureModels -import internal.CaptureSummaryFlow +import utils.modelgenerator.internal.CaptureModels +import utils.modelgenerator.internal.CaptureSummaryFlow from DataFlowTargetApi api, string noflow where noflow = captureNoFlow(api) diff --git a/java/ql/src/utils/model-generator/CaptureSinkModels.ql b/java/ql/src/utils/model-generator/CaptureSinkModels.ql index 
740e5aa119a..f047a8a13af 100644 --- a/java/ql/src/utils/model-generator/CaptureSinkModels.ql +++ b/java/ql/src/utils/model-generator/CaptureSinkModels.ql @@ -6,7 +6,7 @@ * @tags model-generator */ -import internal.CaptureModels +import utils.modelgenerator.internal.CaptureModels class Activate extends ActiveConfiguration { override predicate activateToSinkConfig() { any() } diff --git a/java/ql/src/utils/model-generator/CaptureSourceModels.ql b/java/ql/src/utils/model-generator/CaptureSourceModels.ql index 77a00602f3f..7dfc8e0ad34 100644 --- a/java/ql/src/utils/model-generator/CaptureSourceModels.ql +++ b/java/ql/src/utils/model-generator/CaptureSourceModels.ql @@ -6,7 +6,7 @@ * @tags model-generator */ -import internal.CaptureModels +import utils.modelgenerator.internal.CaptureModels class Activate extends ActiveConfiguration { override predicate activateFromSourceConfig() { any() } diff --git a/java/ql/src/utils/model-generator/CaptureSummaryModels.ql b/java/ql/src/utils/model-generator/CaptureSummaryModels.ql index 981c6fe73fc..a8d23ca5b34 100644 --- a/java/ql/src/utils/model-generator/CaptureSummaryModels.ql +++ b/java/ql/src/utils/model-generator/CaptureSummaryModels.ql @@ -6,8 +6,8 @@ * @tags model-generator */ -import internal.CaptureModels -import internal.CaptureSummaryFlow +import utils.modelgenerator.internal.CaptureModels +import utils.modelgenerator.internal.CaptureSummaryFlow from DataFlowTargetApi api, string flow where flow = captureFlow(api) diff --git a/java/ql/src/utils/model-generator/CaptureTypeBasedSummaryModels.ql b/java/ql/src/utils/model-generator/CaptureTypeBasedSummaryModels.ql new file mode 100644 index 00000000000..1cdb5fe959f --- /dev/null +++ b/java/ql/src/utils/model-generator/CaptureTypeBasedSummaryModels.ql @@ -0,0 +1,13 @@ +/** + * @name Capture typed based summary models. + * @description Finds applicable summary models to be used by other queries. 
+ * @kind diagnostic + * @id java/utils/model-generator/summary-models-typed-based + * @tags model-generator + */ + +import utils.modelgenerator.internal.CaptureTypeBasedSummaryModels + +from TypeBasedFlowTargetApi api, string flow +where flow = captureFlow(api) +select flow order by flow diff --git a/java/ql/src/utils/model-generator/internal/CaptureModels.qll b/java/ql/src/utils/modelgenerator/internal/CaptureModels.qll similarity index 100% rename from java/ql/src/utils/model-generator/internal/CaptureModels.qll rename to java/ql/src/utils/modelgenerator/internal/CaptureModels.qll diff --git a/java/ql/src/utils/model-generator/internal/CaptureModelsSpecific.qll b/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll similarity index 98% rename from java/ql/src/utils/model-generator/internal/CaptureModelsSpecific.qll rename to java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll index 64516fbaded..6e9fe7c29b2 100644 --- a/java/ql/src/utils/model-generator/internal/CaptureModelsSpecific.qll +++ b/java/ql/src/utils/modelgenerator/internal/CaptureModelsSpecific.qll @@ -67,6 +67,8 @@ private predicate isRelevantForModels(J::Callable api) { */ predicate isRelevantForDataFlowModels = isRelevantForModels/1; +predicate isRelevantForTypeBasedFlowModels = isRelevantForModels/1; + /** * A class of Callables that are relevant for generating summary, source and sinks models for. 
* @@ -141,7 +143,7 @@ string asPartialNegativeModel(TargetApiSpecific api) { ) } -private predicate isPrimitiveTypeUsedForBulkData(J::Type t) { +predicate isPrimitiveTypeUsedForBulkData(J::Type t) { t.hasName(["byte", "char", "Byte", "Character"]) } diff --git a/java/ql/src/utils/model-generator/internal/CaptureSummaryFlow.qll b/java/ql/src/utils/modelgenerator/internal/CaptureSummaryFlow.qll similarity index 100% rename from java/ql/src/utils/model-generator/internal/CaptureSummaryFlow.qll rename to java/ql/src/utils/modelgenerator/internal/CaptureSummaryFlow.qll diff --git a/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll b/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll new file mode 100644 index 00000000000..3f7f4dd97e6 --- /dev/null +++ b/java/ql/src/utils/modelgenerator/internal/CaptureTypeBasedSummaryModels.qll @@ -0,0 +1,330 @@ +private import java +private import semmle.code.java.Collections +private import semmle.code.java.dataflow.internal.ContainerFlow +private import CaptureModelsSpecific as Specific +private import CaptureModels + +/** + * A type representing instantiations of class types + * that has a method which returns an iterator. + */ +private class IterableClass extends Class { + private Type elementType; + + IterableClass() { + elementType = + unique(Type et | + exists(Method m, RefType return, GenericType t, int position | m.getDeclaringType() = t | + return = m.getReturnType() and + return.getSourceDeclaration().hasQualifiedName("java.util", "Iterator") and + t.getTypeParameter(position) = return.(ParameterizedType).getTypeArgument(0) and + instantiates(this, t, position, et) + ) + ) + } + + /** + * Returns the iterator element type of `this`. + */ + Type getElementType() { result = elementType } +} + +/** + * Holds if type `bound` is an upper bound for type `t` or equal to `t`. 
+ */ +private predicate isEffectivelyUpperBound(Type t, Type bound) { + t = bound or + t.(Wildcard).getUpperBound().getType() = bound +} + +/** + * Holds if type `bound` is a lower bound for type `t` or equal to `t`. + */ +private predicate isEffectivelyLowerBound(Type t, Type bound) { + t = bound or + t.(Wildcard).getLowerBound().getType() = bound +} + +/** + * Holds if `t` is a container like type of `tv` (eg. `List `). + */ +private predicate genericContainerType(RefType t, TypeVariable tv) { + exists(Type et | + et = + [ + t.(ContainerType).getElementType(), t.(IterableClass).getElementType(), + t.(Array).getElementType() + ] + | + isEffectivelyUpperBound(et, tv) + ) +} + +/** + * Holds if `tv` is a type variable of the immediate type declaring `callable`. + */ +private predicate classTypeParameter(Callable callable, TypeVariable tv) { + callable.getDeclaringType().(GenericType).getATypeParameter() = tv +} + +/** + * Holds if `tv` is type variable of `callable` or the type declaring `callable`. + */ +private predicate localTypeParameter(Callable callable, TypeVariable tv) { + classTypeParameter(callable, tv) or + callable.(GenericCallable).getATypeParameter() = tv +} + +/** + * Gets the access path postfix for `t`. + */ +private string getAccessPath(Type t) { + if + t instanceof Array and + not Specific::isPrimitiveTypeUsedForBulkData(t.(Array).getElementType()) + then result = ".ArrayElement" + else + if t instanceof ContainerType or t instanceof IterableClass + then result = ".Element" + else result = "" +} + +/** + * Gets the access path for parameter `p`. + */ +private string parameterAccess(Parameter p) { + result = "Argument[" + p.getPosition() + "]" + getAccessPath(p.getType()) +} + +/** + * Holds if `callable` has a type parameter `tv` or container parameterized over type `tv`. 
+ */ +private predicate parameter(Callable callable, string input, TypeVariable tv) { + exists(Parameter p, Type pt | + input = parameterAccess(p) and + p = callable.getAParameter() and + pt = p.getType() and + ( + // Parameter of type tv + isEffectivelyUpperBound(pt, tv) + or + // Parameter is a container of type tv + genericContainerType(pt, tv) + ) + ) +} + +/** + * Gets the string representation of a synthetic field corresponding to `tv`. + */ +private string getSyntheticField(TypeVariable tv) { + result = ".SyntheticField[ArgType" + tv.getIndex() + "]" +} + +/** + * Gets a models as data string representation of, how a value of type `tv` + * can be read or stored implicitly in relation to `callable`. + */ +private string implicit(Callable callable, TypeVariable tv) { + classTypeParameter(callable, tv) and + not callable.isStatic() and + exists(string access, Type decl | + decl = callable.getDeclaringType() and + if genericContainerType(decl, tv) + then access = getAccessPath(decl) + else access = getSyntheticField(tv) + | + result = Specific::qualifierString() + access + ) +} + +private class GenericFunctionalInterface extends FunctionalInterface, GenericType { + override string getAPrimaryQlClass() { result = "GenericFunctionalInterface" } +} + +/** + * A class of types that represents functions. + */ +private class Function extends ParameterizedType { + private GenericFunctionalInterface fi; + + Function() { fi = this.getGenericType() } + + /** + * Gets the typevariable that is the placeholder for the type `t` + * used in the instantiation of `this`. + */ + private TypeVariable getTypeReplacement(Type t) { + exists(int position | + instantiates(this, fi, position, t) and + result = fi.getTypeParameter(position) + ) + } + + /** + * Gets the parameter type of `this` function at position `position`. + * Note that functions are often contravariant in their parameter types. 
+ */ + Type getParameterType(int position) { + exists(Type t | + fi.getRunMethod().getParameterType(position) = this.getTypeReplacement(t) and + isEffectivelyLowerBound(t, result) + ) + } + + /** + * Gets the return type of `this` function. + * Note that functions are often covariant in their return type. + */ + Type getReturnType() { + exists(Type t | + fi.getRunMethod().getReturnType() = this.getTypeReplacement(t) and + isEffectivelyUpperBound(t, result) + ) + } +} + +/** + * Holds if `callable` has a function parameter `f` at parameter position `position`. + */ +private predicate functional(Callable callable, Function f, int position) { + callable.getParameterType(position) = f +} + +/** + * Gets models as data input/output access relative to the type parameter `tv` in the + * type `t` in the scope of `callable`. + * + * Note: This predicate has to be inlined as `callable` is not related to `return` or `tv` + * in every disjunction. + */ +bindingset[callable] +private string getAccess(Callable callable, Type return, TypeVariable tv) { + return = tv and result = "" + or + genericContainerType(return, tv) and result = getAccessPath(return) + or + not genericContainerType(return, tv) and + ( + return.(ParameterizedType).getATypeArgument() = tv + or + callable.getDeclaringType() = return and return.(GenericType).getATypeParameter() = tv + ) and + result = getSyntheticField(tv) +} + +/** + * Holds if `input` is a models as data string representation of, how a value of type `tv` + * (or a generic parameterized over `tv`) can be generated by a function parameter of `callable`. 
+ */ +private predicate functionalSource(Callable callable, string input, TypeVariable tv) { + exists(Function f, int position, Type return, string access | + functional(callable, f, position) and + return = f.getReturnType() and + access = getAccess(callable, return, tv) and + input = "Argument[" + position + "].ReturnValue" + access + ) +} + +/** + * Holds if `input` is a models as data string representation of, how a + * value of type `tv` (or a generic parameterized over `tv`) + * can be provided as input to `callable`. + * This includes + * (1) The implicit synthetic field(s) of the declaring type of `callable`. + * (2) The parameters of `callable`. + * (3) Any function parameters of `callable`. + */ +private predicate input(Callable callable, string input, TypeVariable tv) { + input = implicit(callable, tv) + or + parameter(callable, input, tv) + or + functionalSource(callable, input, tv) +} + +/** + * Holds if `callable` returns a value of type `tv` (or a generic parameterized over `tv`) and `output` + * is a models as data string representation of, how data is routed to the return. + */ +private predicate returns(Callable callable, TypeVariable tv, string output) { + exists(Type return, string access | return = callable.getReturnType() | + access = getAccess(callable, return, tv) and + output = "ReturnValue" + access + ) +} + +/** + * Holds if `callable` has a function parameter that accepts a value of type `tv` + * and `output` is the models as data string representation of, how data is routed to + * the function parameter. + */ +private predicate functionalSink(Callable callable, TypeVariable tv, string output) { + exists(Function f, int p1, int p2 | + functional(callable, f, p1) and + tv = f.getParameterType(p2) and + output = "Argument[" + p1 + "]" + ".Parameter[" + p2 + "]" + ) +} + +/** + * Holds if `output` is a models as data string representation of, how values of type `tv` + * (or generics parameterized over `tv`) can be routed. 
+ * This includes + * (1) The implicit synthetic field(s) of the declaring type of `callable`. + * (2) The return of `callable`. + * (3) Any function parameters of `callable`. + */ +private predicate output(Callable callable, TypeVariable tv, string output) { + output = implicit(callable, tv) + or + returns(callable, tv, output) + or + functionalSink(callable, tv, output) +} + +/** + * A class of callables that are relevant generating summaries for based + * on the Theorems for Free approach. + */ +class TypeBasedFlowTargetApi extends Specific::TargetApiSpecific { + TypeBasedFlowTargetApi() { Specific::isRelevantForTypeBasedFlowModels(this) } + + /** + * Gets the string representation of all type based summaries for `this` + * inspired by the Theorems for Free approach. + * + * Examples could be (see Java pseudo code below) + * (1) `get` returns a value of type `T`. We assume that the returned + * value was fetched from a (synthetic) field. + * (2) `set` consumes a value of type `T`. We assume that the value is stored in + * a (synthetic) field. + * (3) `apply ` is assumed to apply the provided function to a value stored in + * a (synthetic) field and return the result. + * (4) `apply` is assumed to apply the provided function to provided value + * and return the result. + * ```java + * public class MyGeneric { + * public void set(T x) { ... } + * public T get() { ... } + * public S apply(Functionf) { ... } + * public S2 apply (S1 x, Function f) { ... } + * } + * ``` + */ + string getSummaries() { + exists(TypeVariable tv, string input, string output | + localTypeParameter(this, tv) and + input(this, input, tv) and + output(this, tv, output) and + input != output + | + result = asValueModel(this, input, output) + ) + } +} + +/** + * Returns the Theorems for Free inspired typed based summaries for `api`. 
+ */ +string captureFlow(TypeBasedFlowTargetApi api) { result = api.getSummaries() } diff --git a/java/ql/test/library-tests/regex/parser/RegexParseTests.ql b/java/ql/test/library-tests/regex/parser/RegexParseTests.ql index 345031a3b2d..4c8d7519f14 100644 --- a/java/ql/test/library-tests/regex/parser/RegexParseTests.ql +++ b/java/ql/test/library-tests/regex/parser/RegexParseTests.ql @@ -1,10 +1,12 @@ import java -import semmle.code.java.regex.RegexTreeView -import semmle.code.java.regex.regex +import semmle.code.java.regex.RegexTreeView as RegexTreeView +import semmle.code.java.regex.regex as Regex -string getQLClases(RegExpTerm t) { result = "[" + strictconcat(t.getPrimaryQLClass(), ",") + "]" } +string getQLClases(RegexTreeView::RegExpTerm t) { + result = "[" + strictconcat(t.getPrimaryQLClass(), ",") + "]" +} -query predicate parseFailures(Regex r, int i) { r.failedToParse(i) } +query predicate parseFailures(Regex::Regex r, int i) { r.failedToParse(i) } -from RegExpTerm t +from RegexTreeView::RegExpTerm t select t, getQLClases(t) diff --git a/java/ql/test/query-tests/security/CWE-730/PolynomialReDoS.ql b/java/ql/test/query-tests/security/CWE-730/PolynomialReDoS.ql index bd600a6d8af..c8c1566a7a4 100644 --- a/java/ql/test/query-tests/security/CWE-730/PolynomialReDoS.ql +++ b/java/ql/test/query-tests/security/CWE-730/PolynomialReDoS.ql @@ -1,4 +1,3 @@ -import java import TestUtilities.InlineExpectationsTest import semmle.code.java.security.regexp.PolynomialReDoSQuery @@ -9,7 +8,10 @@ class HasPolyRedos extends InlineExpectationsTest { override predicate hasActualResult(Location location, string element, string tag, string value) { tag = "hasPolyRedos" and - exists(DataFlow::PathNode source, DataFlow::PathNode sink, PolynomialBackTrackingTerm regexp | + exists( + DataFlow::PathNode source, DataFlow::PathNode sink, + SuperlinearBackTracking::PolynomialBackTrackingTerm regexp + | hasPolynomialReDoSResult(source, sink, regexp) and location = 
sink.getNode().getLocation() and element = sink.getNode().toString() and diff --git a/java/ql/test/query-tests/security/CWE-730/ReDoS.ql b/java/ql/test/query-tests/security/CWE-730/ReDoS.ql index 288ca57f2e2..7226541bcb2 100644 --- a/java/ql/test/query-tests/security/CWE-730/ReDoS.ql +++ b/java/ql/test/query-tests/security/CWE-730/ReDoS.ql @@ -1,6 +1,7 @@ import java import TestUtilities.InlineExpectationsTest -import semmle.code.java.security.regexp.ExponentialBackTracking +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.nfa.ExponentialBackTracking::Make as ExponentialBackTracking import semmle.code.java.regex.regex class HasExpRedos extends InlineExpectationsTest { @@ -10,8 +11,8 @@ class HasExpRedos extends InlineExpectationsTest { override predicate hasActualResult(Location location, string element, string tag, string value) { tag = "hasExpRedos" and - exists(RegExpTerm t, string pump, State s, string prefixMsg | - hasReDoSResult(t, pump, s, prefixMsg) and + exists(TreeView::RegExpTerm t, string pump, ExponentialBackTracking::State s, string prefixMsg | + ExponentialBackTracking::hasReDoSResult(t, pump, s, prefixMsg) and not t.getRegex().getAMode() = "VERBOSE" and value = "" and location = t.getLocation() and diff --git a/java/ql/test/utils/model-generator/CaptureNegativeSummaryModels.expected b/java/ql/test/utils/model-generator/dataflow/CaptureNegativeSummaryModels.expected similarity index 100% rename from java/ql/test/utils/model-generator/CaptureNegativeSummaryModels.expected rename to java/ql/test/utils/model-generator/dataflow/CaptureNegativeSummaryModels.expected diff --git a/java/ql/test/utils/model-generator/CaptureNegativeSummaryModels.qlref b/java/ql/test/utils/model-generator/dataflow/CaptureNegativeSummaryModels.qlref similarity index 100% rename from java/ql/test/utils/model-generator/CaptureNegativeSummaryModels.qlref rename to 
java/ql/test/utils/model-generator/dataflow/CaptureNegativeSummaryModels.qlref diff --git a/java/ql/test/utils/model-generator/CaptureSinkModels.expected b/java/ql/test/utils/model-generator/dataflow/CaptureSinkModels.expected similarity index 100% rename from java/ql/test/utils/model-generator/CaptureSinkModels.expected rename to java/ql/test/utils/model-generator/dataflow/CaptureSinkModels.expected diff --git a/java/ql/test/utils/model-generator/CaptureSinkModels.qlref b/java/ql/test/utils/model-generator/dataflow/CaptureSinkModels.qlref similarity index 100% rename from java/ql/test/utils/model-generator/CaptureSinkModels.qlref rename to java/ql/test/utils/model-generator/dataflow/CaptureSinkModels.qlref diff --git a/java/ql/test/utils/model-generator/CaptureSourceModels.expected b/java/ql/test/utils/model-generator/dataflow/CaptureSourceModels.expected similarity index 100% rename from java/ql/test/utils/model-generator/CaptureSourceModels.expected rename to java/ql/test/utils/model-generator/dataflow/CaptureSourceModels.expected diff --git a/java/ql/test/utils/model-generator/CaptureSourceModels.qlref b/java/ql/test/utils/model-generator/dataflow/CaptureSourceModels.qlref similarity index 100% rename from java/ql/test/utils/model-generator/CaptureSourceModels.qlref rename to java/ql/test/utils/model-generator/dataflow/CaptureSourceModels.qlref diff --git a/java/ql/test/utils/model-generator/CaptureSummaryModels.expected b/java/ql/test/utils/model-generator/dataflow/CaptureSummaryModels.expected similarity index 100% rename from java/ql/test/utils/model-generator/CaptureSummaryModels.expected rename to java/ql/test/utils/model-generator/dataflow/CaptureSummaryModels.expected diff --git a/java/ql/test/utils/model-generator/CaptureSummaryModels.qlref b/java/ql/test/utils/model-generator/dataflow/CaptureSummaryModels.qlref similarity index 100% rename from java/ql/test/utils/model-generator/CaptureSummaryModels.qlref rename to 
java/ql/test/utils/model-generator/dataflow/CaptureSummaryModels.qlref diff --git a/java/ql/test/utils/model-generator/p/AbstractImplOfExternalSPI.java b/java/ql/test/utils/model-generator/dataflow/p/AbstractImplOfExternalSPI.java similarity index 100% rename from java/ql/test/utils/model-generator/p/AbstractImplOfExternalSPI.java rename to java/ql/test/utils/model-generator/dataflow/p/AbstractImplOfExternalSPI.java diff --git a/java/ql/test/utils/model-generator/p/Factory.java b/java/ql/test/utils/model-generator/dataflow/p/Factory.java similarity index 100% rename from java/ql/test/utils/model-generator/p/Factory.java rename to java/ql/test/utils/model-generator/dataflow/p/Factory.java diff --git a/java/ql/test/utils/model-generator/p/FinalClass.java b/java/ql/test/utils/model-generator/dataflow/p/FinalClass.java similarity index 100% rename from java/ql/test/utils/model-generator/p/FinalClass.java rename to java/ql/test/utils/model-generator/dataflow/p/FinalClass.java diff --git a/java/ql/test/utils/model-generator/p/FluentAPI.java b/java/ql/test/utils/model-generator/dataflow/p/FluentAPI.java similarity index 100% rename from java/ql/test/utils/model-generator/p/FluentAPI.java rename to java/ql/test/utils/model-generator/dataflow/p/FluentAPI.java diff --git a/java/ql/test/utils/model-generator/p/ImmutablePojo.java b/java/ql/test/utils/model-generator/dataflow/p/ImmutablePojo.java similarity index 100% rename from java/ql/test/utils/model-generator/p/ImmutablePojo.java rename to java/ql/test/utils/model-generator/dataflow/p/ImmutablePojo.java diff --git a/java/ql/test/utils/model-generator/p/ImplOfExternalSPI.java b/java/ql/test/utils/model-generator/dataflow/p/ImplOfExternalSPI.java similarity index 100% rename from java/ql/test/utils/model-generator/p/ImplOfExternalSPI.java rename to java/ql/test/utils/model-generator/dataflow/p/ImplOfExternalSPI.java diff --git a/java/ql/test/utils/model-generator/p/InnerClasses.java 
b/java/ql/test/utils/model-generator/dataflow/p/InnerClasses.java similarity index 100% rename from java/ql/test/utils/model-generator/p/InnerClasses.java rename to java/ql/test/utils/model-generator/dataflow/p/InnerClasses.java diff --git a/java/ql/test/utils/model-generator/p/InnerHolder.java b/java/ql/test/utils/model-generator/dataflow/p/InnerHolder.java similarity index 100% rename from java/ql/test/utils/model-generator/p/InnerHolder.java rename to java/ql/test/utils/model-generator/dataflow/p/InnerHolder.java diff --git a/java/ql/test/utils/model-generator/p/Joiner.java b/java/ql/test/utils/model-generator/dataflow/p/Joiner.java similarity index 100% rename from java/ql/test/utils/model-generator/p/Joiner.java rename to java/ql/test/utils/model-generator/dataflow/p/Joiner.java diff --git a/java/ql/test/utils/model-generator/p/MultipleImpls.java b/java/ql/test/utils/model-generator/dataflow/p/MultipleImpls.java similarity index 100% rename from java/ql/test/utils/model-generator/p/MultipleImpls.java rename to java/ql/test/utils/model-generator/dataflow/p/MultipleImpls.java diff --git a/java/ql/test/utils/model-generator/p/ParamFlow.java b/java/ql/test/utils/model-generator/dataflow/p/ParamFlow.java similarity index 100% rename from java/ql/test/utils/model-generator/p/ParamFlow.java rename to java/ql/test/utils/model-generator/dataflow/p/ParamFlow.java diff --git a/java/ql/test/utils/model-generator/p/Pojo.java b/java/ql/test/utils/model-generator/dataflow/p/Pojo.java similarity index 100% rename from java/ql/test/utils/model-generator/p/Pojo.java rename to java/ql/test/utils/model-generator/dataflow/p/Pojo.java diff --git a/java/ql/test/utils/model-generator/p/PrivateFlowViaPublicInterface.java b/java/ql/test/utils/model-generator/dataflow/p/PrivateFlowViaPublicInterface.java similarity index 100% rename from java/ql/test/utils/model-generator/p/PrivateFlowViaPublicInterface.java rename to 
java/ql/test/utils/model-generator/dataflow/p/PrivateFlowViaPublicInterface.java diff --git a/java/ql/test/utils/model-generator/p/Sinks.java b/java/ql/test/utils/model-generator/dataflow/p/Sinks.java similarity index 100% rename from java/ql/test/utils/model-generator/p/Sinks.java rename to java/ql/test/utils/model-generator/dataflow/p/Sinks.java diff --git a/java/ql/test/utils/model-generator/p/SomeEnum.java b/java/ql/test/utils/model-generator/dataflow/p/SomeEnum.java similarity index 100% rename from java/ql/test/utils/model-generator/p/SomeEnum.java rename to java/ql/test/utils/model-generator/dataflow/p/SomeEnum.java diff --git a/java/ql/test/utils/model-generator/p/Sources.java b/java/ql/test/utils/model-generator/dataflow/p/Sources.java similarity index 100% rename from java/ql/test/utils/model-generator/p/Sources.java rename to java/ql/test/utils/model-generator/dataflow/p/Sources.java diff --git a/java/ql/test/utils/model-generator/typebasedflow/CaptureTypeBasedSummaryModels.expected b/java/ql/test/utils/model-generator/typebasedflow/CaptureTypeBasedSummaryModels.expected new file mode 100644 index 00000000000..ee55a9c6ba6 --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/CaptureTypeBasedSummaryModels.expected @@ -0,0 +1,2 @@ +unexpectedSummary +expectedSummary diff --git a/java/ql/test/utils/model-generator/typebasedflow/CaptureTypeBasedSummaryModels.ql b/java/ql/test/utils/model-generator/typebasedflow/CaptureTypeBasedSummaryModels.ql new file mode 100644 index 00000000000..d1f5c9c520e --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/CaptureTypeBasedSummaryModels.ql @@ -0,0 +1,26 @@ +import java +import utils.modelgenerator.internal.CaptureTypeBasedSummaryModels + +private string expects() { + exists(Javadoc doc | + doc.getChild(0).toString().regexpCapture(" *(SPURIOUS-)?MaD=(.*)", 2) = result + ) +} + +private string flows() { exists(TypeBasedFlowTargetApi api | result = captureFlow(api)) } + +query predicate 
unexpectedSummary(string msg) { + exists(string flow | + flow = flows() and + not flow = expects() and + msg = "Unexpected summary found: " + flow + ) +} + +query predicate expectedSummary(string msg) { + exists(string e | + e = expects() and + not e = flows() and + msg = "Expected summary missing: " + e + ) +} diff --git a/java/ql/test/utils/model-generator/typebasedflow/p/MyFunction.java b/java/ql/test/utils/model-generator/typebasedflow/p/MyFunction.java new file mode 100644 index 00000000000..2dea243bf98 --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/p/MyFunction.java @@ -0,0 +1,10 @@ +package p; + +@FunctionalInterface +public interface MyFunction { + + // MaD=p;MyFunction;true;apply;(Object,Object);;Argument[-1].SyntheticField[ArgType2];ReturnValue;value;generated + // MaD=p;MyFunction;true;apply;(Object,Object);;Argument[0];Argument[-1].SyntheticField[ArgType0];value;generated + // MaD=p;MyFunction;true;apply;(Object,Object);;Argument[1];Argument[-1].SyntheticField[ArgType1];value;generated + T3 apply(T1 x, T2 y); +} \ No newline at end of file diff --git a/java/ql/test/utils/model-generator/typebasedflow/p/Stream.java b/java/ql/test/utils/model-generator/typebasedflow/p/Stream.java new file mode 100644 index 00000000000..c148df6a91b --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/p/Stream.java @@ -0,0 +1,247 @@ +package p; + +import java.util.*; +import java.util.function.*; +import java.util.stream.LongStream; +import java.util.stream.IntStream; +import java.util.stream.DoubleStream; +import java.util.stream.Collector; + +/** + * This is a stub implementation of the Java Stream API. 
+ */ +public class Stream { + + // MaD=p;Stream;true;iterator;();;Argument[-1].Element;ReturnValue.Element;value;generated + public Iterator iterator() { + return null; + } + + // MaD=p;Stream;true;allMatch;(Predicate);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public boolean allMatch(Predicate super T> predicate) { + return false; + } + + // MaD=p;Stream;true;collect;(Supplier,BiConsumer,BiConsumer);;Argument[-1].Element;Argument[1].Parameter[1];value;generated + // MaD=p;Stream;true;collect;(Supplier,BiConsumer,BiConsumer);;Argument[0].ReturnValue;Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;collect;(Supplier,BiConsumer,BiConsumer);;Argument[0].ReturnValue;Argument[2].Parameter[0];value;generated + // MaD=p;Stream;true;collect;(Supplier,BiConsumer,BiConsumer);;Argument[0].ReturnValue;Argument[2].Parameter[1];value;generated + // MaD=p;Stream;true;collect;(Supplier,BiConsumer,BiConsumer);;Argument[0].ReturnValue;ReturnValue;value;generated + public R collect(Supplier supplier, BiConsumer accumulator, BiConsumer combiner) { + return null; + } + + // Collector is not a functional interface, so this is not supported + public R collect(Collector super T, A, R> collector) { + return null; + } + + // MaD=p;Stream;true;concat;(Stream,Stream);;Argument[0].Element;ReturnValue.Element;value;generated + // MaD=p;Stream;true;concat;(Stream,Stream);;Argument[1].Element;ReturnValue.Element;value;generated + public static Stream concat(Stream extends T> a, Stream extends T> b) { + return null; + } + + // MaD=p;Stream;true;distinct;();;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream distinct() { + return null; + } + + public static Stream empty() { + return null; + } + + // MaD=p;Stream;true;filter;(Predicate);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;filter;(Predicate);;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream filter(Predicate super T> 
predicate) { + return null; + } + + // MaD=p;Stream;true;findAny;();;Argument[-1].Element;ReturnValue.SyntheticField[ArgType0];value;generated + public Optional findAny() { + return null; + } + + // MaD=p;Stream;true;findFirst;();;Argument[-1].Element;ReturnValue.SyntheticField[ArgType0];value;generated + public Optional findFirst() { + return null; + } + + // MaD=p;Stream;true;flatMap;(Function);;Argument[0].ReturnValue.Element;ReturnValue.Element;value;generated + // MaD=p;Stream;true;flatMap;(Function);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public Stream flatMap(Function super T, ? extends Stream extends R>> mapper) { + return null; + } + + // MaD=p;Stream;true;flatMapToDouble;(Function);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public DoubleStream flatMapToDouble(Function super T, ? extends DoubleStream> mapper) { + return null; + } + + // MaD=p;Stream;true;flatMapToInt;(Function);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public IntStream flatMapToInt(Function super T, ? extends IntStream> mapper) { + return null; + } + + // MaD=p;Stream;true;flatMapToLong;(Function);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public LongStream flatMapToLong(Function super T, ? 
extends LongStream> mapper) { + return null; + } + + // MaD=p;Stream;true;forEach;(Consumer);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public void forEach(Consumer super T> action) { + } + + // MaD=p;Stream;true;forEachOrdered;(Consumer);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public void forEachOrdered(Consumer super T> action) { + } + + // MaD=p;Stream;true;generate;(Supplier);;Argument[0].ReturnValue;ReturnValue.Element;value;generated + public static Stream generate(Supplier s) { + return null; + } + + // MaD=p;Stream;true;iterate;(Object,UnaryOperator);;Argument[0];Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;iterate;(Object,UnaryOperator);;Argument[0];ReturnValue.Element;value;generated + // MaD=p;Stream;true;iterate;(Object,UnaryOperator);;Argument[1].ReturnValue;Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;iterate;(Object,UnaryOperator);;Argument[1].ReturnValue;ReturnValue.Element;value;generated + public static Stream iterate(T seed, UnaryOperator f) { + return null; + } + + // MaD=p;Stream;true;limit;(long);;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream limit(long maxSize) { + return null; + } + + // MaD=p;Stream;true;map;(Function);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;map;(Function);;Argument[0].ReturnValue;ReturnValue.Element;value;generated + public Stream map(Function super T, ? 
extends R> mapper) { + return null; + } + + // MaD=p;Stream;true;mapToDouble;(ToDoubleFunction);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public DoubleStream mapToDouble(ToDoubleFunction super T> mapper) { + return null; + } + + // MaD=p;Stream;true;mapToInt;(ToIntFunction);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public IntStream mapToInt(ToIntFunction super T> mapper) { + return null; + } + + // MaD=p;Stream;true;mapToLong;(ToLongFunction);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public LongStream mapToLong(ToLongFunction super T> mapper) { + return null; + } + + // MaD=p;Stream;true;max;(Comparator);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;max;(Comparator);;Argument[-1].Element;Argument[0].Parameter[1];value;generated + // MaD=p;Stream;true;max;(Comparator);;Argument[-1].Element;ReturnValue.SyntheticField[ArgType0];value;generated + public Optional max(Comparator super T> comparator) { + return null; + } + + // MaD=p;Stream;true;min;(Comparator);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;min;(Comparator);;Argument[-1].Element;Argument[0].Parameter[1];value;generated + // MaD=p;Stream;true;min;(Comparator);;Argument[-1].Element;ReturnValue.SyntheticField[ArgType0];value;generated + public Optional min(Comparator super T> comparator) { + return null; + } + + // MaD=p;Stream;true;noneMatch;(Predicate);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + public boolean noneMatch(Predicate super T> predicate) { + return false; + } + + // MaD=p;Stream;true;of;(Object[]);;Argument[0].ArrayElement;ReturnValue.Element;value;generated + public static Stream of(T... 
t) { + return null; + } + + // MaD=p;Stream;true;of;(Object);;Argument[0];ReturnValue.Element;value;generated + public static Stream of(T t) { + return null; + } + + // MaD=p;Stream;true;peek;(Consumer);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;peek;(Consumer);;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream peek(Consumer super T> action) { + return null; + } + + // The generated models are only partially correct. + // MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[-1].Element;Argument[0].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[-1].Element;ReturnValue.SyntheticField[ArgType0];value;generated + // MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[0].ReturnValue;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[0].ReturnValue;Argument[0].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[0].ReturnValue;ReturnValue.SyntheticField[ArgType0];value;generated + // SPURIOUS-MaD=p;Stream;true;reduce;(BinaryOperator);;Argument[0].ReturnValue;Argument[-1].Element;value;generated + public Optional reduce(BinaryOperator accumulator) { + return null; + } + + // The generated models are only partially correct. 
+ // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[-1].Element;Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[-1].Element;Argument[1].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[0];Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[0];Argument[1].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[0];ReturnValue;value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[1].ReturnValue;Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[1].ReturnValue;Argument[1].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[1].ReturnValue;ReturnValue;value;generated + // SPURIOUS-MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[-1].Element;ReturnValue;value;generated + // SPURIOUS-MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[0];Argument[-1].Element;value;generated + // SPURIOUS-MaD=p;Stream;true;reduce;(Object,BinaryOperator);;Argument[1].ReturnValue;Argument[-1].Element;value;generated + public T reduce(T identity, BinaryOperator accumulator) { + return null; + } + + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[-1].Element;Argument[1].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[0];Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[0];Argument[2].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[0];Argument[2].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[0];ReturnValue;value;generated + // 
MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[1].ReturnValue;Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[1].ReturnValue;Argument[2].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[1].ReturnValue;Argument[2].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[1].ReturnValue;ReturnValue;value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[2].ReturnValue;Argument[1].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[2].ReturnValue;Argument[2].Parameter[0];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[2].ReturnValue;Argument[2].Parameter[1];value;generated + // MaD=p;Stream;true;reduce;(Object,BiFunction,BinaryOperator);;Argument[2].ReturnValue;ReturnValue;value;generated + public U reduce(U identity, BiFunction accumulator, BinaryOperator combiner) { + return null; + } + + // MaD=p;Stream;true;skip;(long);;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream skip(long n) { + return null; + } + + // MaD=p;Stream;true;sorted;();;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream sorted() { + return null; + } + + // MaD=p;Stream;true;sorted;(Comparator);;Argument[-1].Element;Argument[0].Parameter[0];value;generated + // MaD=p;Stream;true;sorted;(Comparator);;Argument[-1].Element;Argument[0].Parameter[1];value;generated + // MaD=p;Stream;true;sorted;(Comparator);;Argument[-1].Element;ReturnValue.Element;value;generated + public Stream sorted(Comparator super T> comparator) { + return null; + } + + // Models can never be generated correctly based on the type information + // as it involves downcasting. 
+ public Object[] toArray() { + return null; + } + + // The generated result is only partially correct as there is no mentioning of + // the type T in the method definition. + // MaD=p;Stream;true;toArray;(IntFunction);;Argument[0].ReturnValue.ArrayElement;ReturnValue.ArrayElement;value;generated + public A[] toArray(IntFunction generator) { + return null; + } +} \ No newline at end of file diff --git a/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedCollection.java b/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedCollection.java new file mode 100644 index 00000000000..6cd7dcdc5af --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedCollection.java @@ -0,0 +1,25 @@ +package p; + +import java.util.List; +import java.util.ArrayList; + +public class TypeBasedCollection extends ArrayList { + + // MaD=p;TypeBasedCollection;true;addT;(Object);;Argument[0];Argument[-1].Element;value;generated + public void addT(T x) { + } + + // MaD=p;TypeBasedCollection;true;addManyT;(List);;Argument[0].Element;Argument[-1].Element;value;generated + public void addManyT(List xs) { + } + + // MaD=p;TypeBasedCollection;true;firstT;();;Argument[-1].Element;ReturnValue;value;generated + public T firstT() { + return null; + } + + // MaD=p;TypeBasedCollection;true;getManyT;();;Argument[-1].Element;ReturnValue.Element;value;generated + public List getManyT() { + return null; + } +} \ No newline at end of file diff --git a/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedComplex.java b/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedComplex.java new file mode 100644 index 00000000000..127f319b49b --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedComplex.java @@ -0,0 +1,91 @@ +package p; + +import java.util.List; +import java.util.function.Function; + +public class TypeBasedComplex { + + // 
MaD=p;TypeBasedComplex;true;addMany;(List);;Argument[0].Element;Argument[-1].SyntheticField[ArgType0];value;generated + public void addMany(List xs) { + } + + // MaD=p;TypeBasedComplex;true;getMany;();;Argument[-1].SyntheticField[ArgType0];ReturnValue.Element;value;generated + public List getMany() { + return null; + } + + // MaD=p;TypeBasedComplex;true;apply;(Function);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + public Integer apply(Function f) { + return null; + } + + // A method that doesn't mention `T` in its type signature. + // This is for testing that we don't generate a summary that involves the + // implicit field for `T`. + // MaD=p;TypeBasedComplex;true;apply2;(Object,Function);;Argument[0];Argument[1].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;apply2;(Object,Function);;Argument[1].ReturnValue;ReturnValue;value;generated + public T2 apply2(T1 x, Function f) { + return null; + } + + // MaD=p;TypeBasedComplex;true;flatMap;(Function);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;flatMap;(Function);;Argument[-1].SyntheticField[ArgType0];ReturnValue.SyntheticField[ArgType0];value;generated + // MaD=p;TypeBasedComplex;true;flatMap;(Function);;Argument[0].ReturnValue.Element;Argument[-1].SyntheticField[ArgType0];value;generated + // MaD=p;TypeBasedComplex;true;flatMap;(Function);;Argument[0].ReturnValue.Element;Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;flatMap;(Function);;Argument[0].ReturnValue.Element;ReturnValue.SyntheticField[ArgType0];value;generated + public TypeBasedComplex flatMap(Function > f) { + return null; + } + + // MaD=p;TypeBasedComplex;true;flatMap2;(Function);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;flatMap2;(Function);;Argument[0].ReturnValue.Element;ReturnValue.SyntheticField[ArgType0];value;generated + public 
TypeBasedComplexflatMap2(Function> f) { + return null; + } + + // MaD=p;TypeBasedComplex;true;map;(Function);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;map;(Function);;Argument[0].ReturnValue;ReturnValue;value;generated + public S map(Functionf) { + return null; + } + + // MaD=p;TypeBasedComplex;true;mapComplex;(Function);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;mapComplex;(Function);;Argument[0].ReturnValue;ReturnValue.SyntheticField[ArgType0];value;generated + public TypeBasedComplexmapComplex(Functionf) { + return null; + } + + // MaD=p;TypeBasedComplex;true;returnComplex;(Function);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;returnComplex;(Function);;Argument[-1].SyntheticField[ArgType0];ReturnValue.SyntheticField[ArgType0];value;generated + // MaD=p;TypeBasedComplex;true;returnComplex;(Function);;Argument[0].ReturnValue.SyntheticField[ArgType0];Argument[-1].SyntheticField[ArgType0];value;generated + // MaD=p;TypeBasedComplex;true;returnComplex;(Function);;Argument[0].ReturnValue.SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;returnComplex;(Function);;Argument[0].ReturnValue.SyntheticField[ArgType0];ReturnValue.SyntheticField[ArgType0];value;generated + public TypeBasedComplex returnComplex(Function > f) { + return null; + } + + // MaD=p;TypeBasedComplex;true;set;(Integer,Function);;Argument[1].ReturnValue;Argument[-1].SyntheticField[ArgType0];value;generated + public void set(Integer x, Function f) { + } + + // MaD=p;TypeBasedComplex;true;applyMyFunction;(MyFunction,Integer);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;applyMyFunction;(MyFunction,Integer);;Argument[0].ReturnValue;Argument[-1].SyntheticField[ArgType0];value;generated + 
// MaD=p;TypeBasedComplex;true;applyMyFunction;(MyFunction,Integer);;Argument[0].ReturnValue;Argument[0].Parameter[0];value;generated + public Integer applyMyFunction(MyFunction f, Integer x) { + return null; + } + + // MaD=p;TypeBasedComplex;true;applyMyFunctionGeneric;(MyFunction,Object);;Argument[-1].SyntheticField[ArgType0];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;applyMyFunctionGeneric;(MyFunction,Object);;Argument[0].ReturnValue;ReturnValue;value;generated + // MaD=p;TypeBasedComplex;true;applyMyFunctionGeneric;(MyFunction,Object);;Argument[1];Argument[0].Parameter[1];value;generated + public S2 applyMyFunctionGeneric(MyFunction f, S1 x) { + return null; + } + + // MaD=p;TypeBasedComplex;true;applyMyFunctionGeneric;(MyFunction,Object,Object);;Argument[0].ReturnValue;ReturnValue;value;generated + // MaD=p;TypeBasedComplex;true;applyMyFunctionGeneric;(MyFunction,Object,Object);;Argument[1];Argument[0].Parameter[0];value;generated + // MaD=p;TypeBasedComplex;true;applyMyFunctionGeneric;(MyFunction,Object,Object);;Argument[2];Argument[0].Parameter[1];value;generated + public S3 applyMyFunctionGeneric(MyFunction f, S1 x, S2 y) { + return null; + } +} \ No newline at end of file diff --git a/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedSimple.java b/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedSimple.java new file mode 100644 index 00000000000..59fe6e48f13 --- /dev/null +++ b/java/ql/test/utils/model-generator/typebasedflow/p/TypeBasedSimple.java @@ -0,0 +1,42 @@ +package p; + +public class TypeBasedSimple { + + // MaD=p;TypeBasedSimple;true;TypeBasedSimple;(Object);;Argument[0];Argument[-1].SyntheticField[ArgType0];value;generated + public TypeBasedSimple(T t) { + } + + // MaD=p;TypeBasedSimple;true;get;();;Argument[-1].SyntheticField[ArgType0];ReturnValue;value;generated + public T get() { + return null; + } + + // 
MaD=p;TypeBasedSimple;true;get;(Object);;Argument[-1].SyntheticField[ArgType0];ReturnValue;value;generated + public T get(Object o) { + return null; + } + + // MaD=p;TypeBasedSimple;true;id;(Object);;Argument[-1].SyntheticField[ArgType0];ReturnValue;value;generated + // MaD=p;TypeBasedSimple;true;id;(Object);;Argument[0];Argument[-1].SyntheticField[ArgType0];value;generated + // MaD=p;TypeBasedSimple;true;id;(Object);;Argument[0];ReturnValue;value;generated + public T id(T x) { + return null; + } + + // MaD=p;TypeBasedSimple;true;id2;(Object);;Argument[0];ReturnValue;value;generated + public S id2(S x) { + return null; + } + + // MaD=p;TypeBasedSimple;true;set;(Object);;Argument[0];Argument[-1].SyntheticField[ArgType0];value;generated + public void set(T x) { + } + + // MaD=p;TypeBasedSimple;true;set;(int,Object);;Argument[1];Argument[-1].SyntheticField[ArgType0];value;generated + public void set(int x, T y) { + } + + // No summary as S is unrelated to T + publicvoid set2(S x) { + } +} \ No newline at end of file diff --git a/javascript/extractor/src/com/semmle/js/extractor/Main.java b/javascript/extractor/src/com/semmle/js/extractor/Main.java index a15072f2017..f03702ac10d 100644 --- a/javascript/extractor/src/com/semmle/js/extractor/Main.java +++ b/javascript/extractor/src/com/semmle/js/extractor/Main.java @@ -41,7 +41,7 @@ public class Main { * A version identifier that should be updated every time the extractor changes in such a way that * it may produce different tuples for the same file under the same {@link ExtractorConfig}. 
*/ - public static final String EXTRACTOR_VERSION = "2022-11-10"; + public static final String EXTRACTOR_VERSION = "2022-11-15"; public static final Pattern NEWLINE = Pattern.compile("\n"); diff --git a/javascript/extractor/src/com/semmle/js/extractor/TypeExprKinds.java b/javascript/extractor/src/com/semmle/js/extractor/TypeExprKinds.java index a1c7b219a8a..82d4e4319c8 100644 --- a/javascript/extractor/src/com/semmle/js/extractor/TypeExprKinds.java +++ b/javascript/extractor/src/com/semmle/js/extractor/TypeExprKinds.java @@ -10,6 +10,7 @@ import com.semmle.js.ast.TemplateElement; import com.semmle.js.extractor.ASTExtractor.IdContext; import com.semmle.ts.ast.ArrayTypeExpr; import com.semmle.ts.ast.ConditionalTypeExpr; +import com.semmle.js.ast.DynamicImport; import com.semmle.ts.ast.FunctionTypeExpr; import com.semmle.ts.ast.GenericTypeExpr; import com.semmle.ts.ast.ImportTypeExpr; @@ -221,8 +222,7 @@ public class TypeExprKinds { return inferTypeExpr; } - @Override - public Integer visit(ImportTypeExpr nd, Void c) { + private Integer handleInlineImport() { switch (idcontext) { case NAMESPACE_BIND: return importNamespaceAccess; @@ -235,6 +235,17 @@ public class TypeExprKinds { } } + @Override + public Integer visit(ImportTypeExpr nd, Void c) { + return handleInlineImport(); + } + + @Override + public Integer visit(DynamicImport nd, Void c) { + // These may appear in interface 'extend' clauses + return handleInlineImport(); + } + @Override public Integer visit(OptionalTypeExpr nd, Void c) { return optionalTypeExpr; diff --git a/javascript/extractor/tests/ts/input/dynamic-type.ts b/javascript/extractor/tests/ts/input/dynamic-type.ts new file mode 100644 index 00000000000..2b2f90337a2 --- /dev/null +++ b/javascript/extractor/tests/ts/input/dynamic-type.ts @@ -0,0 +1 @@ +interface Foo extends import("foo").Bar {} diff --git a/javascript/extractor/tests/ts/output/trap/dynamic-type.ts.trap b/javascript/extractor/tests/ts/output/trap/dynamic-type.ts.trap new file mode 
100644 index 00000000000..71410d093cc --- /dev/null +++ b/javascript/extractor/tests/ts/output/trap/dynamic-type.ts.trap @@ -0,0 +1,139 @@ +#10000=@"/dynamic-type.ts;sourcefile" +files(#10000,"/dynamic-type.ts") +#10001=@"/;folder" +folders(#10001,"/") +containerparent(#10001,#10000) +#10002=@"loc,{#10000},0,0,0,0" +locations_default(#10002,#10000,0,0,0,0) +hasLocation(#10000,#10002) +#20000=@"global_scope" +scopes(#20000,0) +#20001=@"script;{#10000},1,1" +#20002=* +lines(#20002,#20001,"interface Foo extends import(""foo"").Bar {}"," +") +#20003=@"loc,{#10000},1,1,1,42" +locations_default(#20003,#10000,1,1,1,42) +hasLocation(#20002,#20003) +numlines(#20001,1,1,0) +#20004=* +tokeninfo(#20004,7,#20001,0,"interface") +#20005=@"loc,{#10000},1,1,1,9" +locations_default(#20005,#10000,1,1,1,9) +hasLocation(#20004,#20005) +#20006=* +tokeninfo(#20006,6,#20001,1,"Foo") +#20007=@"loc,{#10000},1,11,1,13" +locations_default(#20007,#10000,1,11,1,13) +hasLocation(#20006,#20007) +#20008=* +tokeninfo(#20008,7,#20001,2,"extends") +#20009=@"loc,{#10000},1,15,1,21" +locations_default(#20009,#10000,1,15,1,21) +hasLocation(#20008,#20009) +#20010=* +tokeninfo(#20010,7,#20001,3,"import") +#20011=@"loc,{#10000},1,23,1,28" +locations_default(#20011,#10000,1,23,1,28) +hasLocation(#20010,#20011) +#20012=* +tokeninfo(#20012,8,#20001,4,"(") +#20013=@"loc,{#10000},1,29,1,29" +locations_default(#20013,#10000,1,29,1,29) +hasLocation(#20012,#20013) +#20014=* +tokeninfo(#20014,4,#20001,5,"""foo""") +#20015=@"loc,{#10000},1,30,1,34" +locations_default(#20015,#10000,1,30,1,34) +hasLocation(#20014,#20015) +#20016=* +tokeninfo(#20016,8,#20001,6,")") +#20017=@"loc,{#10000},1,35,1,35" +locations_default(#20017,#10000,1,35,1,35) +hasLocation(#20016,#20017) +#20018=* +tokeninfo(#20018,8,#20001,7,".") +#20019=@"loc,{#10000},1,36,1,36" +locations_default(#20019,#10000,1,36,1,36) +hasLocation(#20018,#20019) +#20020=* +tokeninfo(#20020,6,#20001,8,"Bar") +#20021=@"loc,{#10000},1,37,1,39" 
+locations_default(#20021,#10000,1,37,1,39) +hasLocation(#20020,#20021) +#20022=* +tokeninfo(#20022,8,#20001,9,"{") +#20023=@"loc,{#10000},1,41,1,41" +locations_default(#20023,#10000,1,41,1,41) +hasLocation(#20022,#20023) +#20024=* +tokeninfo(#20024,8,#20001,10,"}") +#20025=@"loc,{#10000},1,42,1,42" +locations_default(#20025,#10000,1,42,1,42) +hasLocation(#20024,#20025) +#20026=* +tokeninfo(#20026,0,#20001,11,"") +#20027=@"loc,{#10000},2,1,2,0" +locations_default(#20027,#10000,2,1,2,0) +hasLocation(#20026,#20027) +toplevels(#20001,0) +#20028=@"loc,{#10000},1,1,2,0" +locations_default(#20028,#10000,1,1,2,0) +hasLocation(#20001,#20028) +#20029=@"local_type_name;{Foo};{#20000}" +local_type_names(#20029,"Foo",#20000) +#20030=* +stmts(#20030,34,#20001,0,"interfa ... .Bar {}") +hasLocation(#20030,#20003) +stmt_containers(#20030,#20001) +#20031=* +typeexprs(#20031,13,#20030,-1,"import(""foo"").Bar") +#20032=@"loc,{#10000},1,23,1,39" +locations_default(#20032,#10000,1,23,1,39) +hasLocation(#20031,#20032) +enclosing_stmt(#20031,#20030) +expr_containers(#20031,#20001) +#20033=* +typeexprs(#20033,31,#20031,0,"import(""foo"")") +#20034=@"loc,{#10000},1,23,1,35" +locations_default(#20034,#10000,1,23,1,35) +hasLocation(#20033,#20034) +enclosing_stmt(#20033,#20030) +expr_containers(#20033,#20001) +#20035=* +exprs(#20035,4,#20033,0,"""foo""") +hasLocation(#20035,#20015) +enclosing_stmt(#20035,#20030) +expr_containers(#20035,#20001) +literals("foo","""foo""",#20035) +#20036=* +regexpterm(#20036,14,#20035,0,"foo") +#20037=@"loc,{#10000},1,31,1,33" +locations_default(#20037,#10000,1,31,1,33) +hasLocation(#20036,#20037) +regexp_const_value(#20036,"foo") +#20038=* +typeexprs(#20038,15,#20031,1,"Bar") +hasLocation(#20038,#20021) +enclosing_stmt(#20038,#20030) +expr_containers(#20038,#20001) +literals("Bar","Bar",#20038) +#20039=* +typeexprs(#20039,1,#20030,0,"Foo") +hasLocation(#20039,#20007) +enclosing_stmt(#20039,#20030) +expr_containers(#20039,#20001) +literals("Foo","Foo",#20039) 
+typedecl(#20039,#20029) +#20040=* +entry_cfg_node(#20040,#20001) +#20041=@"loc,{#10000},1,1,1,0" +locations_default(#20041,#10000,1,1,1,0) +hasLocation(#20040,#20041) +#20042=* +exit_cfg_node(#20042,#20001) +hasLocation(#20042,#20027) +successor(#20030,#20042) +successor(#20040,#20030) +numlines(#10000,1,1,0) +filetype(#10000,"typescript") diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll index ba555b0de5b..55d75ad2e4d 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll @@ -50,7 +50,8 @@ abstract class AtmConfig extends string { // known sink for the class. exists(EndpointCharacteristic characteristic | characteristic.getEndpoints(sink) and - characteristic.getImplications(this.getASinkEndpointType(), true, 1.0) + characteristic + .getImplications(this.getASinkEndpointType(), true, characteristic.maximalConfidence()) ) } diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll index adc98a5c08c..e1539a504ec 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll @@ -30,18 +30,36 @@ abstract class EndpointCharacteristic extends string { /** * This predicate describes what the characteristic tells us about an endpoint. * - * Params: - * endpointClass: Class 0 is the negative class. Each positive int corresponds to a single sink type. 
- * isPositiveIndicator: Does this characteristic indicate this endpoint _is_ a member of the class, or that it - * _isn't_ a member of the class? - * confidence: A number in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint - * belonging / not belonging to the given class. + * Params: + * endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier + * class for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single + * sink type. + * isPositiveIndicator: If true, this characteristic indicates that this endpoint _is_ a member of the class; if + * false, it indicates that it _isn't_ a member of the class. + * confidence: A float in [0, 1], which tells us how strong an indicator this characteristic is for the endpoint + * belonging / not belonging to the given class. A confidence near zero means this characteristic is a very weak + * indicator of whether or not the endpoint belongs to the class. A confidence of 1 means that all endpoints with + * this characteristic definitively do/don't belong to the class. */ abstract predicate getImplications( EndpointType endpointClass, boolean isPositiveIndicator, float confidence ); + + // The following are some confidence values that are used in practice by the subclasses. They are defined as named + // constants here to make it easier to change them in the future. + final float maximalConfidence() { result = 1.0 } + + final float highConfidence() { result = 0.9 } + + final float mediumConfidence() { result = 0.6 } } +/* + * Characteristics that are indicative of a sink. + * NOTE: Initially each sink type has only one characteristic, which is that it's a sink of this type in the standard + * JavaScript libraries. + */ + /** * Endpoints identified as "DomBasedXssSink" by the standard JavaScript libraries are XSS sinks with maximal confidence. 
*/ @@ -53,7 +71,9 @@ private class DomBasedXssSinkCharacteristic extends EndpointCharacteristic { override predicate getImplications( EndpointType endpointClass, boolean isPositiveIndicator, float confidence ) { - endpointClass instanceof XssSinkType and isPositiveIndicator = true and confidence = 1.0 + endpointClass instanceof XssSinkType and + isPositiveIndicator = true and + confidence = maximalConfidence() } } @@ -69,7 +89,9 @@ private class TaintedPathSinkCharacteristic extends EndpointCharacteristic { override predicate getImplications( EndpointType endpointClass, boolean isPositiveIndicator, float confidence ) { - endpointClass instanceof TaintedPathSinkType and isPositiveIndicator = true and confidence = 1.0 + endpointClass instanceof TaintedPathSinkType and + isPositiveIndicator = true and + confidence = maximalConfidence() } } @@ -87,7 +109,7 @@ private class SqlInjectionSinkCharacteristic extends EndpointCharacteristic { ) { endpointClass instanceof SqlInjectionSinkType and isPositiveIndicator = true and - confidence = 1.0 + confidence = maximalConfidence() } } @@ -105,6 +127,315 @@ private class NosqlInjectionSinkCharacteristic extends EndpointCharacteristic { ) { endpointClass instanceof NosqlInjectionSinkType and isPositiveIndicator = true and - confidence = 1.0 + confidence = maximalConfidence() + } +} + +/* + * Characteristics that are indicative of not being a sink of any type. + */ + +/** + * A characteristic that is an indicator of not being a sink of any type, because it's an argument to a function of a + * builtin object. + */ +abstract private class ArgumentToBuiltinFunctionCharacteristic extends EndpointCharacteristic { + bindingset[this] + ArgumentToBuiltinFunctionCharacteristic() { any() } +} + +/** + * A high-confidence characteristic that indicates that an endpoint is not a sink of any type. 
+ */ +abstract private class NotASinkCharacteristic extends EndpointCharacteristic { + bindingset[this] + NotASinkCharacteristic() { any() } + + override predicate getImplications( + EndpointType endpointClass, boolean isPositiveIndicator, float confidence + ) { + endpointClass instanceof NegativeType and + isPositiveIndicator = true and + confidence = highConfidence() + } +} + +/** + * A medium-confidence characteristic that indicates that an endpoint is not a sink of any type. + * + * TODO: This class is currently not private, because the current extraction logic explicitly avoids including these + * endpoints in the training data. We might want to change this in the future. + */ +abstract class LikelyNotASinkCharacteristic extends EndpointCharacteristic { + bindingset[this] + LikelyNotASinkCharacteristic() { any() } + + override predicate getImplications( + EndpointType endpointClass, boolean isPositiveIndicator, float confidence + ) { + endpointClass instanceof NegativeType and + isPositiveIndicator = true and + confidence = mediumConfidence() + } +} + +private class LodashUnderscore extends NotASinkCharacteristic { + LodashUnderscore() { this = "LodashUnderscoreArgument" } + + override predicate getEndpoints(DataFlow::Node n) { + any(LodashUnderscore::Member m).getACall().getAnArgument() = n + } +} + +private class JQueryArgumentCharacteristic extends NotASinkCharacteristic { + JQueryArgumentCharacteristic() { this = "JQueryArgument" } + + override predicate getEndpoints(DataFlow::Node n) { + any(JQuery::MethodCall m).getAnArgument() = n + } +} + +private class ClientRequestCharacteristic extends NotASinkCharacteristic { + ClientRequestCharacteristic() { this = "ClientRequest" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(ClientRequest r | + r.getAnArgument() = n or n = r.getUrl() or n = r.getHost() or n = r.getADataNode() + ) + } +} + +private class PromiseDefinitionCharacteristic extends NotASinkCharacteristic { + 
PromiseDefinitionCharacteristic() { this = "PromiseDefinition" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(PromiseDefinition p | + n = [p.getResolveParameter(), p.getRejectParameter()].getACall().getAnArgument() + ) + } +} + +private class CryptographicKeyCharacteristic extends NotASinkCharacteristic { + CryptographicKeyCharacteristic() { this = "CryptographicKey" } + + override predicate getEndpoints(DataFlow::Node n) { n instanceof CryptographicKey } +} + +private class CryptographicOperationFlowCharacteristic extends NotASinkCharacteristic { + CryptographicOperationFlowCharacteristic() { this = "CryptographicOperationFlow" } + + override predicate getEndpoints(DataFlow::Node n) { + any(CryptographicOperation op).getInput() = n + } +} + +private class LoggerMethodCharacteristic extends NotASinkCharacteristic { + LoggerMethodCharacteristic() { this = "LoggerMethod" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call.getCalleeName() = getAStandardLoggerMethodName() + ) + } +} + +private class TimeoutCharacteristic extends NotASinkCharacteristic { + TimeoutCharacteristic() { this = "Timeout" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call.getCalleeName() = ["setTimeout", "clearTimeout"] + ) + } +} + +private class ReceiverStorageCharacteristic extends NotASinkCharacteristic { + ReceiverStorageCharacteristic() { this = "ReceiverStorage" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call.getReceiver() = DataFlow::globalVarRef(["localStorage", "sessionStorage"]) + ) + } +} + +private class StringStartsWithCharacteristic extends NotASinkCharacteristic { + StringStartsWithCharacteristic() { this = "StringStartsWith" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = 
call.getAnArgument() | + call instanceof StringOps::StartsWith + ) + } +} + +private class StringEndsWithCharacteristic extends NotASinkCharacteristic { + StringEndsWithCharacteristic() { this = "StringEndsWith" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof StringOps::EndsWith) + } +} + +private class StringRegExpTestCharacteristic extends NotASinkCharacteristic { + StringRegExpTestCharacteristic() { this = "StringRegExpTest" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call instanceof StringOps::RegExpTest + ) + } +} + +private class EventRegistrationCharacteristic extends NotASinkCharacteristic { + EventRegistrationCharacteristic() { this = "EventRegistration" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof EventRegistration) + } +} + +private class EventDispatchCharacteristic extends NotASinkCharacteristic { + EventDispatchCharacteristic() { this = "EventDispatch" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof EventDispatch) + } +} + +private class MembershipCandidateTestCharacteristic extends NotASinkCharacteristic { + MembershipCandidateTestCharacteristic() { this = "MembershipCandidateTest" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call = any(MembershipCandidate c).getTest() + ) + } +} + +private class FileSystemAccessCharacteristic extends NotASinkCharacteristic { + FileSystemAccessCharacteristic() { this = "FileSystemAccess" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | call instanceof FileSystemAccess) + } +} + +private class DatabaseAccessCharacteristic extends 
NotASinkCharacteristic { + DatabaseAccessCharacteristic() { this = "DatabaseAccess" } + + override predicate getEndpoints(DataFlow::Node n) { + // TODO database accesses are less well defined than database query sinks, so this may cover unmodeled sinks on + // existing database models + exists(DataFlow::CallNode call | n = call.getAnArgument() | + [ + call, call.getAMethodCall() + /* command pattern where the query is built, and then exec'ed later */ ] instanceof + DatabaseAccess + ) + } +} + +private class DomCharacteristic extends NotASinkCharacteristic { + DomCharacteristic() { this = "DOM" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | call = DOM::domValueRef()) + } +} + +private class NextFunctionCallCharacteristic extends NotASinkCharacteristic { + NextFunctionCallCharacteristic() { this = "NextFunctionCall" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call.getCalleeName() = "next" and + exists(DataFlow::FunctionNode f | call = f.getLastParameter().getACall()) + ) + } +} + +private class DojoRequireCharacteristic extends NotASinkCharacteristic { + DojoRequireCharacteristic() { this = "DojoRequire" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | n = call.getAnArgument() | + call = DataFlow::globalVarRef("dojo").getAPropertyRead("require").getACall() + ) + } +} + +private class Base64ManipulationCharacteristic extends NotASinkCharacteristic { + Base64ManipulationCharacteristic() { this = "Base64Manipulation" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(Base64::Decode d | n = d.getInput()) or + exists(Base64::Encode d | n = d.getInput()) + } +} + +private class ArgumentToArrayCharacteristic extends ArgumentToBuiltinFunctionCharacteristic, + LikelyNotASinkCharacteristic { + ArgumentToArrayCharacteristic() { this = "ArgumentToArray" } + + override 
predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::SourceNode builtin, DataFlow::SourceNode receiver, DataFlow::InvokeNode invk | + builtin instanceof DataFlow::ArrayCreationNode + | + receiver = [builtin.getAnInvocation(), builtin] and + invk = [receiver, receiver.getAPropertyRead()].getAnInvocation() and + invk.getAnArgument() = n + ) + } +} + +private class ArgumentToBuiltinGlobalVarRefCharacteristic extends ArgumentToBuiltinFunctionCharacteristic, + LikelyNotASinkCharacteristic { + ArgumentToBuiltinGlobalVarRefCharacteristic() { this = "ArgumentToBuiltinGlobalVarRef" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::SourceNode builtin, DataFlow::SourceNode receiver, DataFlow::InvokeNode invk | + builtin = + DataFlow::globalVarRef([ + "Map", "Set", "WeakMap", "WeakSet", "Number", "Object", "String", "Array", "Error", + "Math", "Boolean" + ]) + | + receiver = [builtin.getAnInvocation(), builtin] and + invk = [receiver, receiver.getAPropertyRead()].getAnInvocation() and + invk.getAnArgument() = n + ) + } +} + +private class ConstantReceiverCharacteristic extends ArgumentToBuiltinFunctionCharacteristic, + NotASinkCharacteristic { + ConstantReceiverCharacteristic() { this = "ConstantReceiver" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(Expr primitive, MethodCallExpr c | + primitive instanceof ConstantString or + primitive instanceof NumberLiteral or + primitive instanceof BooleanLiteral + | + c.calls(primitive, _) and + c.getAnArgument() = n.asExpr() + ) + } +} + +private class BuiltinCallNameCharacteristic extends ArgumentToBuiltinFunctionCharacteristic, + NotASinkCharacteristic { + BuiltinCallNameCharacteristic() { this = "BuiltinCallName" } + + override predicate getEndpoints(DataFlow::Node n) { + exists(DataFlow::CallNode call | + call.getAnArgument() = n and + call.getCalleeName() = + [ + "indexOf", "hasOwnProperty", "substring", "isDecimal", "decode", "encode", "keys", + "shift", "values", "forEach", 
"toString", "slice", "splice", "push", "isArray", "sort" + ] + ) } } diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointTypes.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointTypes.qll index aa625b12862..d2cc37b1b33 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointTypes.qll +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointTypes.qll @@ -16,6 +16,11 @@ newtype TEndpointType = abstract class EndpointType extends TEndpointType { abstract string getDescription(); + /** + * Gets the integer representation of this endpoint type. This integer representation specifies the class number + * used by the endpoint scoring model (the classifier) to represent this endpoint type. Class 0 is the negative + * class (non-sink). Each positive int corresponds to a single sink type. 
+ */ abstract int getEncoding(); string toString() { result = getDescription() } diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml b/javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml index e0571f38255..fb53f54ded7 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml +++ b/javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml @@ -1,5 +1,5 @@ name: codeql/javascript-experimental-atm-lib -version: 0.4.1 +version: 0.4.2 extractor: javascript library: true groups: diff --git a/javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml b/javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml index cab87ce0e33..725beadcb0e 100644 --- a/javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml +++ b/javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml @@ -1,6 +1,6 @@ name: codeql/javascript-experimental-atm-queries language: javascript -version: 0.4.1 +version: 0.4.2 suites: codeql-suites defaultSuiteFile: codeql-suites/javascript-atm-code-scanning.qls groups: diff --git a/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll b/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll index f49696c5bad..7bc61ee2aee 100644 --- a/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll +++ b/javascript/ql/lib/semmle/javascript/security/internal/SensitiveDataHeuristics.qll @@ -103,7 +103,7 @@ module HeuristicNames { */ string notSensitiveRegexp() { result = - "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((? 
0 and - result.getRegex() = re and - exists(int part_start | - part_start = this.getChild(i - 1).getEnd() + 1 // allow for the | - | - result.getStart() = part_start and - re.alternationOption(start, end, part_start, result.getEnd()) - ) - } - - override string getPrimaryQLClass() { result = "RegExpAlt" } -} - -class RegExpCharEscape = RegExpEscape; - -/** - * An escaped regular expression term, that is, a regular expression - * term starting with a backslash, which is not a backreference. - * - * Example: - * - * ``` - * \. - * \w - * ``` - */ -class RegExpEscape extends RegExpNormalChar { - RegExpEscape() { re.escapedCharacter(start, end) } - +/** An implementation that satisfies the RegexTreeView signature. */ +module Impl implements RegexTreeViewSig { /** - * Gets the name of the escaped; for example, `w` for `\w`. - * TODO: Handle named escapes. + * An element containing a regular expression term, that is, either + * a string literal (parsed as a regular expression) + * or another regular expression term. */ - override string getValue() { - not this.isUnicode() and - this.isIdentityEscape() and - result = this.getUnescaped() - or - this.getUnescaped() = "n" and result = "\n" - or - this.getUnescaped() = "r" and result = "\r" - or - this.getUnescaped() = "t" and result = "\t" - or - this.getUnescaped() = "f" and result = 12.toUnicode() - or - this.getUnescaped() = "v" and result = 11.toUnicode() - or - this.isUnicode() and - result = this.getUnicode() + class RegExpParent extends TRegExpParent { + /** Gets a textual representation of this element. */ + string toString() { result = "RegExpParent" } + + /** Gets the `i`th child term. */ + abstract RegExpTerm getChild(int i); + + /** Gets a child term . */ + RegExpTerm getAChild() { result = this.getChild(_) } + + /** Gets the number of child terms. */ + int getNumChild() { result = count(this.getAChild()) } + + /** Gets the associated regex. 
*/ + abstract Regex getRegex(); } - /** Holds if this terms name is given by the part following the escape character. */ - predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] } + /** A string literal used as a regular expression */ + class RegExpLiteral extends TRegExpLiteral, RegExpParent { + Regex re; - override string getPrimaryQLClass() { result = "RegExpEscape" } + RegExpLiteral() { this = TRegExpLiteral(re) } - /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */ - string getUnescaped() { result = this.getText().suffix(1) } + override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() } + + /** Holds if dot, `.`, matches all characters, including newlines. */ + predicate isDotAll() { re.getAMode() = "DOTALL" } + + /** Holds if this regex matching is case-insensitive for this regex. */ + predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" } + + /** Get a string representing all modes for this regex. */ + string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") } + + override Regex getRegex() { result = re } + + /** Gets the primary QL class for this regex. */ + string getPrimaryQLClass() { result = "RegExpLiteral" } + } /** - * Gets the text for this escape. That is e.g. "\w". + * A regular expression term, that is, a syntactic part of a regular expression. */ - private string getText() { result = re.getText().substring(start, end) } + class RegExpTerm extends RegExpParent { + Regex re; + int start; + int end; - /** - * Holds if this is a unicode escape. 
- */ - private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] } + RegExpTerm() { + this = TRegExpAlt(re, start, end) + or + this = TRegExpBackRef(re, start, end) + or + this = TRegExpCharacterClass(re, start, end) + or + this = TRegExpCharacterRange(re, start, end) + or + this = TRegExpNormalChar(re, start, end) + or + this = TRegExpGroup(re, start, end) + or + this = TRegExpQuantifier(re, start, end) + or + this = TRegExpSequence(re, start, end) + or + this = TRegExpSpecialChar(re, start, end) + } - /** - * Gets the unicode char for this escape. - * E.g. for `\u0061` this returns "a". - */ - private string getUnicode() { - exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) | - result = codepoint.toUnicode() - ) + /** + * Gets the outermost term of this regular expression. + */ + RegExpTerm getRootTerm() { + this.isRootTerm() and result = this + or + result = this.getParent().(RegExpTerm).getRootTerm() + } + + /** + * Holds if this term is part of a string literal + * that is interpreted as a regular expression. + */ + predicate isUsedAsRegExp() { any() } + + /** + * Holds if this is the root term of a regular expression. + */ + predicate isRootTerm() { start = 0 and end = re.getText().length() } + + override RegExpTerm getChild(int i) { + result = this.(RegExpAlt).getChild(i) + or + result = this.(RegExpBackRef).getChild(i) + or + result = this.(RegExpCharacterClass).getChild(i) + or + result = this.(RegExpCharacterRange).getChild(i) + or + result = this.(RegExpNormalChar).getChild(i) + or + result = this.(RegExpGroup).getChild(i) + or + result = this.(RegExpQuantifier).getChild(i) + or + result = this.(RegExpSequence).getChild(i) + or + result = this.(RegExpSpecialChar).getChild(i) + } + + /** + * Gets the parent term of this regular expression term, or the + * regular expression literal if this is the root term. 
+ */ + RegExpParent getParent() { result.getAChild() = this } + + override Regex getRegex() { result = re } + + /** Gets the offset at which this term starts. */ + int getStart() { result = start } + + /** Gets the offset at which this term ends. */ + int getEnd() { result = end } + + override string toString() { result = re.getText().substring(start, end) } + + /** + * Gets the location of the surrounding regex, as locations inside the regex do not exist. + * To get location information corresponding to the term inside the regex, + * use `hasLocationInfo`. + */ + Location getLocation() { result = re.getLocation() } + + /** Holds if this term is found at the specified location offsets. */ + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ) { + exists(int re_start, int re_end | + re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, re_end) and + startcolumn = re_start + start + 4 and + endcolumn = re_start + end + 3 + ) + } + + /** Gets the file in which this term is found. */ + File getFile() { result = this.getLocation().getFile() } + + /** Gets the raw source text of this term. */ + string getRawValue() { result = this.toString() } + + /** Gets the string literal in which this term is found. */ + RegExpLiteral getLiteral() { result = TRegExpLiteral(re) } + + /** Gets the regular expression term that is matched (textually) before this one, if any. */ + RegExpTerm getPredecessor() { + exists(RegExpTerm parent | parent = this.getParent() | + result = parent.(RegExpSequence).previousElement(this) + or + not exists(parent.(RegExpSequence).previousElement(this)) and + not parent instanceof RegExpSubPattern and + result = parent.getPredecessor() + ) + } + + /** Gets the regular expression term that is matched (textually) after this one, if any. 
*/ + RegExpTerm getSuccessor() { + exists(RegExpTerm parent | parent = this.getParent() | + result = parent.(RegExpSequence).nextElement(this) + or + not exists(parent.(RegExpSequence).nextElement(this)) and + not parent instanceof RegExpSubPattern and + result = parent.getSuccessor() + ) + } + + /** Gets the primary QL class for this term. */ + string getPrimaryQLClass() { result = "RegExpTerm" } } /** - * Gets int value for the `index`th char in the hex number of the unicode escape. - * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex). - */ - private int getHexValueFromUnicode(int index) { - this.isUnicode() and - exists(string hex, string char | hex = this.getText().suffix(2) | - char = hex.charAt(index) and - result = 16.pow(hex.length() - index - 1) * toHex(char) - ) - } -} - -/** - * Gets the hex number for the `hex` char. - */ -private int toHex(string hex) { - hex = [0 .. 9].toString() and - result = hex.toInt() - or - result = 10 and hex = ["a", "A"] - or - result = 11 and hex = ["b", "B"] - or - result = 12 and hex = ["c", "C"] - or - result = 13 and hex = ["d", "D"] - or - result = 14 and hex = ["e", "E"] - or - result = 15 and hex = ["f", "F"] -} - -/** - * A word boundary, that is, a regular expression term of the form `\b`. - */ -class RegExpWordBoundary extends RegExpSpecialChar { - RegExpWordBoundary() { this.getChar() = "\\b" } -} - -/** - * A character class escape in a regular expression. - * That is, an escaped character that denotes multiple characters. - * - * Examples: - * - * ``` - * \w - * \S - * ``` - */ -class RegExpCharacterClassEscape extends RegExpEscape { - RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } -} - -/** - * A character class in a regular expression. 
- * - * Examples: - * - * ``` - * [a-z_] - * [^<>&] - * ``` - */ -class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { - RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } - - /** Holds if this character class is inverted, matching the opposite of its content. */ - predicate isInverted() { re.getChar(start + 1) = "^" } - - /** Gets the `i`th char inside this charater class. */ - string getCharThing(int i) { result = re.getChar(i + start) } - - /** Holds if this character class can match anything. */ - predicate isUniversalClass() { - // [^] - this.isInverted() and not exists(this.getAChild()) - or - // [\w\W] and similar - not this.isInverted() and - exists(string cce1, string cce2 | - cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and - cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() - | - cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() - ) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegex() = re and - exists(int itemStart, int itemEnd | - result.getStart() = itemStart and - re.char_set_start(start, itemStart) and - re.char_set_child(start, itemStart, itemEnd) and - result.getEnd() = itemEnd - ) - or - i > 0 and - result.getRegex() = re and - exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() | - result.getStart() = itemStart and - re.char_set_child(start, itemStart, result.getEnd()) - ) - } - - override string getPrimaryQLClass() { result = "RegExpCharacterClass" } -} - -/** - * A character range in a character class in a regular expression. - * - * Example: - * - * ``` - * a-z - * ``` - */ -class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { - int lower_end; - int upper_start; - - RegExpCharacterRange() { - this = TRegExpCharacterRange(re, start, end) and - re.charRange(_, start, lower_end, upper_start, end) - } - - /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. 
*/ - predicate isRange(string lo, string hi) { - lo = re.getText().substring(start, lower_end) and - hi = re.getText().substring(upper_start, end) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegex() = re and - result.getStart() = start and - result.getEnd() = lower_end - or - i = 1 and - result.getRegex() = re and - result.getStart() = upper_start and - result.getEnd() = end - } - - override string getPrimaryQLClass() { result = "RegExpCharacterRange" } -} - -/** - * A normal character in a regular expression, that is, a character - * without special meaning. This includes escaped characters. - * - * Examples: - * ``` - * t - * \t - * ``` - */ -class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { - RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } - - /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) - */ - predicate isCharacter() { any() } - - /** Gets the string representation of the char matched by this term. */ - string getValue() { result = re.getText().substring(start, end) } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpNormalChar" } -} - -/** - * A constant regular expression term, that is, a regular expression - * term matching a single string. Currently, this will always be a single character. 
- * - * Example: - * - * ``` - * a - * ``` - */ -class RegExpConstant extends RegExpTerm { - string value; - - RegExpConstant() { - this = TRegExpNormalChar(re, start, end) and - not this instanceof RegExpCharacterClassEscape and - // exclude chars in qualifiers - // TODO: push this into regex library - not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) | - qstart <= start and end <= qend - ) and - value = this.(RegExpNormalChar).getValue() - } - - /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) - */ - predicate isCharacter() { any() } - - /** Gets the string matched by this constant term. */ - string getValue() { result = value } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpConstant" } -} - -/** - * A grouped regular expression. - * - * Examples: - * - * ``` - * (ECMA|Java) - * (?:ECMA|Java) - * (?['"]) - * ``` - */ -class RegExpGroup extends RegExpTerm, TRegExpGroup { - RegExpGroup() { this = TRegExpGroup(re, start, end) } - - /** - * Gets the index of this capture group within the enclosing regular - * expression literal. + * A quantified regular expression term. * - * For example, in the regular expression `/((a?).)(?:b)/`, the - * group `((a?).)` has index 1, the group `(a?)` nested inside it - * has index 2, and the group `(?:b)` has no index, since it is - * not a capture group. + * Example: + * + * ``` + * ((ECMA|Java)[sS]cript)* + * ``` */ - int getNumber() { result = re.getGroupNumber(start, end) } + class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier { + int part_end; + boolean may_repeat_forever; - /** Holds if this is a capture group. 
*/ - predicate isCapture() { exists(this.getNumber()) } + RegExpQuantifier() { + this = TRegExpQuantifier(re, start, end) and + re.qualifiedPart(start, part_end, end, _, may_repeat_forever) + } - /** Holds if this is a named capture group. */ - predicate isNamed() { exists(this.getName()) } - - /** Gets the name of this capture group, if any. */ - string getName() { result = re.getGroupName(start, end) } - - override RegExpTerm getChild(int i) { - result.getRegex() = re and - i = 0 and - re.groupContents(start, end, result.getStart(), result.getEnd()) - } - - override string getPrimaryQLClass() { result = "RegExpGroup" } -} - -/** - * A special character in a regular expression. - * - * Examples: - * ``` - * ^ - * $ - * . - * ``` - */ -class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { - string char; - - RegExpSpecialChar() { - this = TRegExpSpecialChar(re, start, end) and - re.specialCharacter(start, end, char) - } - - /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) - */ - predicate isCharacter() { any() } - - /** Gets the char for this term. */ - string getChar() { result = char } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpSpecialChar" } -} - -/** - * A dot regular expression. - * - * Example: - * - * ``` - * . - * ``` - */ -class RegExpDot extends RegExpSpecialChar { - RegExpDot() { this.getChar() = "." } - - override string getPrimaryQLClass() { result = "RegExpDot" } -} - -/** - * A dollar assertion `$` or `\Z` matching the end of a line. - * - * Example: - * - * ``` - * $ - * ``` - */ -class RegExpDollar extends RegExpSpecialChar { - RegExpDollar() { this.getChar() = ["$", "\\Z"] } - - override string getPrimaryQLClass() { result = "RegExpDollar" } -} - -/** - * A caret assertion `^` or `\A` matching the beginning of a line. 
- * - * Example: - * - * ``` - * ^ - * ``` - */ -class RegExpCaret extends RegExpSpecialChar { - RegExpCaret() { this.getChar() = ["^", "\\A"] } - - override string getPrimaryQLClass() { result = "RegExpCaret" } -} - -/** - * A zero-width match, that is, either an empty group or an assertion. - * - * Examples: - * ``` - * () - * (?=\w) - * ``` - */ -class RegExpZeroWidthMatch extends RegExpGroup { - RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" } -} - -/** - * A zero-width lookahead or lookbehind assertion. - * - * Examples: - * - * ``` - * (?=\w) - * (?!\n) - * (?<=\.) - * (?` - * in a regular expression. - * - * Examples: - * - * ``` - * \1 - * (?P=quote) - * ``` - */ -class RegExpBackRef extends RegExpTerm, TRegExpBackRef { - RegExpBackRef() { this = TRegExpBackRef(re, start, end) } /** - * Gets the number of the capture group this back reference refers to, if any. + * A regular expression term that permits unlimited repetitions. */ - int getNumber() { result = re.getBackrefNumber(start, end) } + class InfiniteRepetitionQuantifier extends RegExpQuantifier { + InfiniteRepetitionQuantifier() { this.mayRepeatForever() } + } /** - * Gets the name of the capture group this back reference refers to, if any. + * A star-quantified term. + * + * Example: + * + * ``` + * \w* + * ``` */ - string getName() { result = re.getBackrefName(start, end) } + class RegExpStar extends InfiniteRepetitionQuantifier { + RegExpStar() { this.getQualifier().charAt(0) = "*" } - /** Gets the capture group this back reference refers to. */ - RegExpGroup getGroup() { - this.hasLiteralAndNumber(result.getLiteral(), result.getNumber()) or - this.hasLiteralAndName(result.getLiteral(), result.getName()) + override string getPrimaryQLClass() { result = "RegExpStar" } } - /** Join-order helper for `getGroup`. 
*/ - pragma[nomagic] - private predicate hasLiteralAndNumber(RegExpLiteral literal, int number) { - literal = this.getLiteral() and - number = this.getNumber() + /** + * A plus-quantified term. + * + * Example: + * + * ``` + * \w+ + * ``` + */ + class RegExpPlus extends InfiniteRepetitionQuantifier { + RegExpPlus() { this.getQualifier().charAt(0) = "+" } + + override string getPrimaryQLClass() { result = "RegExpPlus" } } - /** Join-order helper for `getGroup`. */ - pragma[nomagic] - private predicate hasLiteralAndName(RegExpLiteral literal, string name) { - literal = this.getLiteral() and - name = this.getName() + /** + * An optional term. + * + * Example: + * + * ``` + * ;? + * ``` + */ + class RegExpOpt extends RegExpQuantifier { + RegExpOpt() { this.getQualifier().charAt(0) = "?" } + + override string getPrimaryQLClass() { result = "RegExpOpt" } } - override RegExpTerm getChild(int i) { none() } + /** + * A range-quantified term + * + * Examples: + * + * ``` + * \w{2,4} + * \w{2,} + * \w{2} + * ``` + */ + class RegExpRange extends RegExpQuantifier { + string upper; + string lower; - override string getPrimaryQLClass() { result = "RegExpBackRef" } + RegExpRange() { re.multiples(part_end, end, lower, upper) } + + /** Gets the string defining the upper bound of this range, if any. */ + string getUpper() { result = upper } + + /** Gets the string defining the lower bound of this range, if any. */ + string getLower() { result = lower } + + /** + * Gets the upper bound of the range, if any. + * + * If there is no upper bound, any number of repetitions is allowed. + * For a term of the form `r{lo}`, both the lower and the upper bound + * are `lo`. + */ + int getUpperBound() { result = this.getUpper().toInt() } + + /** Gets the lower bound of the range. */ + int getLowerBound() { result = this.getLower().toInt() } + + override string getPrimaryQLClass() { result = "RegExpRange" } + } + + /** + * A sequence term. 
+ * + * Example: + * + * ``` + * (ECMA|Java)Script + * ``` + * + * This is a sequence with the elements `(ECMA|Java)` and `Script`. + */ + class RegExpSequence extends RegExpTerm, TRegExpSequence { + RegExpSequence() { this = TRegExpSequence(re, start, end) } + + override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) } + + /** Gets the element preceding `element` in this sequence. */ + RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) } + + /** Gets the element following `element` in this sequence. */ + RegExpTerm nextElement(RegExpTerm element) { + exists(int i | + element = this.getChild(i) and + result = this.getChild(i + 1) + ) + } + + override string getPrimaryQLClass() { result = "RegExpSequence" } + } + + /** + * An alternative term, that is, a term of the form `a|b`. + * + * Example: + * + * ``` + * ECMA|Java + * ``` + */ + class RegExpAlt extends RegExpTerm, TRegExpAlt { + RegExpAlt() { this = TRegExpAlt(re, start, end) } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegex() = re and + result.getStart() = start and + exists(int part_end | + re.alternationOption(start, end, start, part_end) and + result.getEnd() = part_end + ) + or + i > 0 and + result.getRegex() = re and + exists(int part_start | + part_start = this.getChild(i - 1).getEnd() + 1 // allow for the | + | + result.getStart() = part_start and + re.alternationOption(start, end, part_start, result.getEnd()) + ) + } + + override string getPrimaryQLClass() { result = "RegExpAlt" } + } + + class RegExpCharEscape = RegExpEscape; + + /** + * An escaped regular expression term, that is, a regular expression + * term starting with a backslash, which is not a backreference. + * + * Example: + * + * ``` + * \. + * \w + * ``` + */ + class RegExpEscape extends RegExpNormalChar { + RegExpEscape() { re.escapedCharacter(start, end) } + + /** + * Gets the name of the escaped; for example, `w` for `\w`. + * TODO: Handle named escapes. 
+ */ + override string getValue() { + not this.isUnicode() and + this.isIdentityEscape() and + result = this.getUnescaped() + or + this.getUnescaped() = "n" and result = "\n" + or + this.getUnescaped() = "r" and result = "\r" + or + this.getUnescaped() = "t" and result = "\t" + or + this.getUnescaped() = "f" and result = 12.toUnicode() + or + this.getUnescaped() = "v" and result = 11.toUnicode() + or + this.isUnicode() and + result = this.getUnicode() + } + + /** Holds if this terms name is given by the part following the escape character. */ + predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] } + + override string getPrimaryQLClass() { result = "RegExpEscape" } + + /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */ + string getUnescaped() { result = this.getText().suffix(1) } + + /** + * Gets the text for this escape. That is e.g. "\w". + */ + private string getText() { result = re.getText().substring(start, end) } + + /** + * Holds if this is a unicode escape. + */ + private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] } + + /** + * Gets the unicode char for this escape. + * E.g. for `\u0061` this returns "a". + */ + private string getUnicode() { + exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) | + result = codepoint.toUnicode() + ) + } + + /** + * Gets int value for the `index`th char in the hex number of the unicode escape. + * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex). + */ + private int getHexValueFromUnicode(int index) { + this.isUnicode() and + exists(string hex, string char | hex = this.getText().suffix(2) | + char = hex.charAt(index) and + result = 16.pow(hex.length() - index - 1) * toHex(char) + ) + } + } + + /** + * Gets the hex number for the `hex` char. + */ + private int toHex(string hex) { + hex = [0 .. 
9].toString() and + result = hex.toInt() + or + result = 10 and hex = ["a", "A"] + or + result = 11 and hex = ["b", "B"] + or + result = 12 and hex = ["c", "C"] + or + result = 13 and hex = ["d", "D"] + or + result = 14 and hex = ["e", "E"] + or + result = 15 and hex = ["f", "F"] + } + + /** + * A word boundary, that is, a regular expression term of the form `\b`. + */ + class RegExpWordBoundary extends RegExpSpecialChar { + RegExpWordBoundary() { this.getChar() = "\\b" } + } + + /** + * A character class escape in a regular expression. + * That is, an escaped character that denotes multiple characters. + * + * Examples: + * + * ``` + * \w + * \S + * ``` + */ + class RegExpCharacterClassEscape extends RegExpEscape { + RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } + } + + /** + * A character class in a regular expression. + * + * Examples: + * + * ``` + * [a-z_] + * [^<>&] + * ``` + */ + class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { + RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } + + /** Holds if this character class is inverted, matching the opposite of its content. */ + predicate isInverted() { re.getChar(start + 1) = "^" } + + /** Gets the `i`th char inside this charater class. */ + string getCharThing(int i) { result = re.getChar(i + start) } + + /** Holds if this character class can match anything. 
*/ + predicate isUniversalClass() { + // [^] + this.isInverted() and not exists(this.getAChild()) + or + // [\w\W] and similar + not this.isInverted() and + exists(string cce1, string cce2 | + cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegex() = re and + exists(int itemStart, int itemEnd | + result.getStart() = itemStart and + re.char_set_start(start, itemStart) and + re.char_set_child(start, itemStart, itemEnd) and + result.getEnd() = itemEnd + ) + or + i > 0 and + result.getRegex() = re and + exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() | + result.getStart() = itemStart and + re.char_set_child(start, itemStart, result.getEnd()) + ) + } + + override string getPrimaryQLClass() { result = "RegExpCharacterClass" } + } + + /** + * A character range in a character class in a regular expression. + * + * Example: + * + * ``` + * a-z + * ``` + */ + class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { + int lower_end; + int upper_start; + + RegExpCharacterRange() { + this = TRegExpCharacterRange(re, start, end) and + re.charRange(_, start, lower_end, upper_start, end) + } + + /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. 
*/ + predicate isRange(string lo, string hi) { + lo = re.getText().substring(start, lower_end) and + hi = re.getText().substring(upper_start, end) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegex() = re and + result.getStart() = start and + result.getEnd() = lower_end + or + i = 1 and + result.getRegex() = re and + result.getStart() = upper_start and + result.getEnd() = end + } + + override string getPrimaryQLClass() { result = "RegExpCharacterRange" } + } + + /** + * A normal character in a regular expression, that is, a character + * without special meaning. This includes escaped characters. + * + * Examples: + * ``` + * t + * \t + * ``` + */ + class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { + RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string representation of the char matched by this term. */ + string getValue() { result = re.getText().substring(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpNormalChar" } + } + + /** + * A constant regular expression term, that is, a regular expression + * term matching a single string. Currently, this will always be a single character. 
+ * + * Example: + * + * ``` + * a + * ``` + */ + class RegExpConstant extends RegExpTerm { + string value; + + RegExpConstant() { + this = TRegExpNormalChar(re, start, end) and + not this instanceof RegExpCharacterClassEscape and + // exclude chars in qualifiers + // TODO: push this into regex library + not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) | + qstart <= start and end <= qend + ) and + value = this.(RegExpNormalChar).getValue() + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string matched by this constant term. */ + string getValue() { result = value } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpConstant" } + } + + /** + * A grouped regular expression. + * + * Examples: + * + * ``` + * (ECMA|Java) + * (?:ECMA|Java) + * (?['"]) + * ``` + */ + class RegExpGroup extends RegExpTerm, TRegExpGroup { + RegExpGroup() { this = TRegExpGroup(re, start, end) } + + /** + * Gets the index of this capture group within the enclosing regular + * expression literal. + * + * For example, in the regular expression `/((a?).)(?:b)/`, the + * group `((a?).)` has index 1, the group `(a?)` nested inside it + * has index 2, and the group `(?:b)` has no index, since it is + * not a capture group. + */ + int getNumber() { result = re.getGroupNumber(start, end) } + + /** Holds if this is a capture group. */ + predicate isCapture() { exists(this.getNumber()) } + + /** Holds if this is a named capture group. */ + predicate isNamed() { exists(this.getName()) } + + /** Gets the name of this capture group, if any. 
*/ + string getName() { result = re.getGroupName(start, end) } + + override RegExpTerm getChild(int i) { + result.getRegex() = re and + i = 0 and + re.groupContents(start, end, result.getStart(), result.getEnd()) + } + + override string getPrimaryQLClass() { result = "RegExpGroup" } + } + + /** + * A special character in a regular expression. + * + * Examples: + * ``` + * ^ + * $ + * . + * ``` + */ + class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { + string char; + + RegExpSpecialChar() { + this = TRegExpSpecialChar(re, start, end) and + re.specialCharacter(start, end, char) + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the char for this term. */ + string getChar() { result = char } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpSpecialChar" } + } + + /** + * A dot regular expression. + * + * Example: + * + * ``` + * . + * ``` + */ + class RegExpDot extends RegExpSpecialChar { + RegExpDot() { this.getChar() = "." } + + override string getPrimaryQLClass() { result = "RegExpDot" } + } + + /** + * A dollar assertion `$` or `\Z` matching the end of a line. + * + * Example: + * + * ``` + * $ + * ``` + */ + class RegExpDollar extends RegExpSpecialChar { + RegExpDollar() { this.getChar() = ["$", "\\Z"] } + + override string getPrimaryQLClass() { result = "RegExpDollar" } + } + + /** + * A caret assertion `^` or `\A` matching the beginning of a line. + * + * Example: + * + * ``` + * ^ + * ``` + */ + class RegExpCaret extends RegExpSpecialChar { + RegExpCaret() { this.getChar() = ["^", "\\A"] } + + override string getPrimaryQLClass() { result = "RegExpCaret" } + } + + /** + * A zero-width match, that is, either an empty group or an assertion. 
+ * + * Examples: + * ``` + * () + * (?=\w) + * ``` + */ + class RegExpZeroWidthMatch extends RegExpGroup { + RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" } + } + + /** + * A zero-width lookahead or lookbehind assertion. + * + * Examples: + * + * ``` + * (?=\w) + * (?!\n) + * (?<=\.) + * (?` + * in a regular expression. + * + * Examples: + * + * ``` + * \1 + * (?P=quote) + * ``` + */ + class RegExpBackRef extends RegExpTerm, TRegExpBackRef { + RegExpBackRef() { this = TRegExpBackRef(re, start, end) } + + /** + * Gets the number of the capture group this back reference refers to, if any. + */ + int getNumber() { result = re.getBackrefNumber(start, end) } + + /** + * Gets the name of the capture group this back reference refers to, if any. + */ + string getName() { result = re.getBackrefName(start, end) } + + /** Gets the capture group this back reference refers to. */ + RegExpGroup getGroup() { + this.hasLiteralAndNumber(result.getLiteral(), result.getNumber()) or + this.hasLiteralAndName(result.getLiteral(), result.getName()) + } + + /** Join-order helper for `getGroup`. */ + pragma[nomagic] + private predicate hasLiteralAndNumber(RegExpLiteral literal, int number) { + literal = this.getLiteral() and + number = this.getNumber() + } + + /** Join-order helper for `getGroup`. */ + pragma[nomagic] + private predicate hasLiteralAndName(RegExpLiteral literal, string name) { + literal = this.getLiteral() and + name = this.getName() + } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpBackRef" } + } + + class Top = RegExpParent; + + /** + * Holds if `term` is an escape class representing e.g. `\d`. + * `clazz` is which character class it represents, e.g. "d" for `\d`. 
+ */ + predicate isEscapeClass(RegExpTerm term, string clazz) { + exists(RegExpCharacterClassEscape escape | term = escape | escape.getValue() = clazz) + } + + /** + * Holds if `term` is a possessive quantifier. + * As python's regexes do not support possessive quantifiers, this never holds, but is used by the shared library. + */ + predicate isPossessive(RegExpQuantifier term) { none() } + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. + * Not yet implemented for Python. + */ + predicate matchesAnyPrefix(RegExpTerm term) { any() } + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. + * Not yet implemented for Python. + */ + predicate matchesAnySuffix(RegExpTerm term) { any() } + + /** + * Holds if the regular expression should not be considered. + * + * We make the pragmatic performance optimization to ignore regular expressions in files + * that does not belong to the project code (such as installed dependencies). + */ + predicate isExcluded(RegExpParent parent) { + not exists(parent.getRegex().getLocation().getFile().getRelativePath()) + or + // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so + // we explicitly exclude these. + count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 + } + + /** + * Holds if `root` has the `i` flag for case-insensitive matching. + */ + predicate isIgnoreCase(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isIgnoreCase() + } + + /** + * Holds if `root` has the `s` flag for multi-line matching. + */ + predicate isDotAll(RegExpTerm root) { + root.isRootTerm() and + root.getLiteral().isDotAll() + } } - -/** Gets the parse tree resulting from parsing `re`, if such has been constructed. 
*/ -RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() } diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplConsistency.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplConsistency.qll index dde16ab5a2a..f681e90aa21 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplConsistency.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowImplConsistency.qll @@ -136,6 +136,18 @@ module Consistency { msg = "Local flow step does not preserve enclosing callable." } + query predicate readStepIsLocal(Node n1, Node n2, string msg) { + readStep(n1, _, n2) and + nodeGetEnclosingCallable(n1) != nodeGetEnclosingCallable(n2) and + msg = "Read step does not preserve enclosing callable." + } + + query predicate storeStepIsLocal(Node n1, Node n2, string msg) { + storeStep(n1, _, n2) and + nodeGetEnclosingCallable(n1) != nodeGetEnclosingCallable(n2) and + msg = "Store step does not preserve enclosing callable." + } + private DataFlowType typeRepr() { result = getNodeType(_) } query predicate compatibleTypesReflexive(DataFlowType t, string msg) { diff --git a/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll b/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll index 95bfbeeeb5d..446e4487a0f 100644 --- a/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll +++ b/python/ql/lib/semmle/python/security/BadTagFilterQuery.qll @@ -2,155 +2,7 @@ * Provides predicates for reasoning about bad tag filter vulnerabilities. */ -import regexp.RegexpMatching - -/** - * Holds if the regexp `root` should be tested against `str`. - * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`. - * `ignorePrefix` toggles whether the regular expression should be treated as accepting any prefix if it's unanchored. - * `testWithGroups` toggles whether it's tested which groups are filled by a given input string. 
- */ -private predicate isBadTagFilterCandidate( - RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups -) { - // the regexp must mention "<" and ">" explicitly. - forall(string angleBracket | angleBracket = ["<", ">"] | - any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() = - root - ) and - ignorePrefix = true and - ( - str = ["", "", "", "", "", - "", "", " ", " ", - " ", "", - "", "", "", - "", "", "", - "", "" - ] and - testWithGroups = false - ) -} - -/** - * A regexp that matches some string from the `isBadTagFilterCandidate` predicate. - */ -class HtmlMatchingRegExp extends RootTerm { - HtmlMatchingRegExp() { RegexpMatching ::matches(this, _) } - - /** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */ - predicate matches(string str) { RegexpMatching ::matches(this, str) } - - /** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */ - predicate fillsCaptureGroup(string str, int g) { - RegexpMatching ::fillsCaptureGroup(this, str, g) - } -} - -/** DEPRECATED: Alias for HtmlMatchingRegExp */ -deprecated class HTMLMatchingRegExp = HtmlMatchingRegExp; - -/** - * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match. - * - * When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`. - */ -predicate isBadRegexpFilter(HtmlMatchingRegExp regexp, string msg) { - // CVE-2021-33829 - matching both "" and "", but in different capture groups - regexp.matches("") and - regexp.matches("") and - exists(int a, int b | a != b | - regexp.fillsCaptureGroup("", a) and - // might be ambiguously parsed (matching both capture groups), and that is ok here. 
- regexp.fillsCaptureGroup("", b) and - not regexp.fillsCaptureGroup("", a) and - msg = - "Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group " - + a + " and comments ending with --!> are matched with capture group " + - strictconcat(int i | regexp.fillsCaptureGroup("", i) | i.toString(), ", ") + - "." - ) - or - // CVE-2020-17480 - matching "" and other tags, but not "". - exists(int group, int other | - group != other and - regexp.fillsCaptureGroup("", group) and - regexp.fillsCaptureGroup(" ", other) and - not regexp.matches("") and - not regexp.fillsCaptureGroup("", any(int i | i != group)) and - not regexp.fillsCaptureGroup("", group) and - not regexp.fillsCaptureGroup(" ", group) and - not regexp.fillsCaptureGroup("") and - regexp.matches("") and - not regexp.matches(" ") and - ( - not regexp.matches("") and - msg = "This regular expression matches , but not " - or - not regexp.matches("") and - msg = "This regular expression matches , but not " - ) - or - regexp.matches("") and - regexp.matches("") and - not regexp.matches("") and - not regexp.matches(" ") and - msg = "This regular expression does not match script tags where the attribute uses single-quotes." - or - regexp.matches("") and - regexp.matches("") and - not regexp.matches("") and - not regexp.matches(" ") and - msg = "This regular expression does not match script tags where the attribute uses double-quotes." - or - regexp.matches("") and - regexp.matches("") and - not regexp.matches("") and - not regexp.matches(" ") and - not regexp.matches(" ") and - msg = "This regular expression does not match script tags where tabs are used between attributes." 
- or - regexp.matches("") and - not RegExpFlags::isIgnoreCase(regexp) and - not regexp.matches(" ") and - not regexp.matches(" ") and - ( - not regexp.matches("") and - msg = "This regular expression does not match upper case ") and - regexp.matches("") and - msg = "This regular expression does not match mixed case ") and - not regexp.matches(" ") and - not regexp.matches(" ") and - ( - not regexp.matches("") and - msg = "This regular expression does not match script end tags like ." - or - not regexp.matches("") and - msg = "This regular expression does not match script end tags like ." - or - not regexp.matches("") and - msg = "This regular expression does not match script end tags like ." - ) -} +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +// BadTagFilterQuery should be used directly from the shared pack, and not from this file. +deprecated import codeql.regex.nfa.BadTagFilterQuery::Make as Dep +import Dep diff --git a/python/ql/lib/semmle/python/security/OverlyLargeRangeQuery.qll b/python/ql/lib/semmle/python/security/OverlyLargeRangeQuery.qll index 65e662f0bc5..49ec333161c 100644 --- a/python/ql/lib/semmle/python/security/OverlyLargeRangeQuery.qll +++ b/python/ql/lib/semmle/python/security/OverlyLargeRangeQuery.qll @@ -2,288 +2,7 @@ * Classes and predicates for working with suspicious character ranges. */ -// We don't need the NFA utils, just the regexp tree. -// but the below is a nice shared library that exposes the API we need. -import regexp.NfaUtils - -/** - * Gets a rank for `range` that is unique for ranges in the same file. - * Prioritizes ranges that match more characters. - */ -int rankRange(RegExpCharacterRange range) { - range = - rank[result](RegExpCharacterRange r, Location l, int low, int high | - r.getLocation() = l and - isRange(r, low, high) - | - r order by (high - low) desc, l.getStartLine(), l.getStartColumn() - ) -} - -/** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). 
*/ -predicate isRange(RegExpCharacterRange range, int low, int high) { - exists(string lowc, string highc | - range.isRange(lowc, highc) and - low.toUnicode() = lowc and - high.toUnicode() = highc - ) -} - -/** Holds if `char` is an alpha-numeric character. */ -predicate isAlphanumeric(string char) { - // written like this to avoid having a bindingset for the predicate - char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z -} - -/** - * Holds if the given ranges are from the same character class - * and there exists at least one character matched by both ranges. - */ -predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { - exists(RegExpCharacterClass clz | - a = clz.getAChild() and - b = clz.getAChild() and - a != b - | - exists(int alow, int ahigh, int blow, int bhigh | - isRange(a, alow, ahigh) and - isRange(b, blow, bhigh) and - alow <= bhigh and - blow <= ahigh - ) - ) -} - -/** - * Holds if `range` overlaps with the char class `escape` from the same character class. - */ -predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { - exists(RegExpCharacterClass clz, string low, string high | - range = clz.getAChild() and - escape = clz.getAChild() and - range.isRange(low, high) - | - escape.getValue() = "w" and - getInRange(low, high).regexpMatch("\\w") - or - escape.getValue() = "d" and - getInRange(low, high).regexpMatch("\\d") - or - escape.getValue() = "s" and - getInRange(low, high).regexpMatch("\\s") - ) -} - -/** Gets the unicode code point for a `char`. */ -bindingset[char] -int toCodePoint(string char) { result.toUnicode() = char } - -/** A character range that appears to be overly wide. 
*/ -class OverlyWideRange extends RegExpCharacterRange { - OverlyWideRange() { - exists(int low, int high, int numChars | - isRange(this, low, high) and - numChars = (1 + high - low) and - this.getRootTerm().isUsedAsRegExp() and - numChars >= 10 - | - // across the Z-a range (which includes backticks) - toCodePoint("Z") >= low and - toCodePoint("a") <= high - or - // across the 9-A range (which includes e.g. ; and ?) - toCodePoint("9") >= low and - toCodePoint("A") <= high - or - // a non-alphanumeric char as part of the range boundaries - exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) and - // while still being ascii - low < 128 and - high < 128 - ) and - // allowlist for known ranges - not this = allowedWideRanges() - } - - /** Gets a string representation of a character class that matches the same chars as this range. */ - string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } -} - -/** Gets a range that should not be reported as an overly wide range. */ -RegExpCharacterRange allowedWideRanges() { - // ~ is the last printable ASCII character, it's used right in various wide ranges. - result.isRange(_, "~") - or - // the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character. - result.isRange([" ", "!"], _) - or - // the `[@-_]` range is intentional - result.isRange("@", "_") - or - // starting from the zero byte is a good indication that it's purposely matching a large range. - result.isRange(0.toUnicode(), _) -} - -/** Gets a char between (and including) `low` and `high`. */ -bindingset[low, high] -private string getInRange(string low, string high) { - result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() -} - -/** A module computing an equivalent character class for an overly wide range. 
*/ -module RangePrinter { - bindingset[char] - bindingset[result] - private string next(string char) { - exists(int prev, int next | - prev.toUnicode() = char and - next.toUnicode() = result and - next = prev + 1 - ) - } - - /** Gets the points where the parts of the pretty printed range should be cut off. */ - private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } - - /** Gets the char to use in the low end of a range for a given `cut` */ - private string lowCut(string cut) { - cut = ["A", "a", "0"] and - result = cut - or - cut = ["Z", "z", "9"] and - result = next(cut) - } - - /** Gets the char to use in the high end of a range for a given `cut` */ - private string highCut(string cut) { - cut = ["Z", "z", "9"] and - result = cut - or - cut = ["A", "a", "0"] and - next(result) = cut - } - - /** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */ - private string cutoff(OverlyWideRange range, int part) { - exists(int low, int high | isRange(range, low, high) | - result = - rank[part + 1](string cut | - cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high - | - cut order by toCodePoint(cut) - ) - ) - } - - /** Gets the number of parts we should print for a given `range`. */ - private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) } - - /** Holds if the given part of a range should span from `low` to `high`. */ - private predicate part(OverlyWideRange range, int part, string low, string high) { - // first part. - part = 0 and - ( - range.isRange(low, high) and - parts(range) = 1 - or - parts(range) >= 2 and - range.isRange(low, _) and - high = highCut(cutoff(range, part)) - ) - or - // middle - part >= 1 and - part < parts(range) - 1 and - low = lowCut(cutoff(range, part - 1)) and - high = highCut(cutoff(range, part)) - or - // last. 
- part = parts(range) - 1 and - low = lowCut(cutoff(range, part - 1)) and - range.isRange(_, high) - } - - /** Gets an escaped `char` for use in a character class. */ - bindingset[char] - private string escape(string char) { - exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | - if char.regexpMatch(reg) then result = "\\" + char else result = char - ) - } - - /** Gets a part of the equivalent range. */ - private string printEquivalentCharClass(OverlyWideRange range, int part) { - exists(string low, string high | part(range, part, low, high) | - if - isAlphanumeric(low) and - isAlphanumeric(high) - then result = low + "-" + high - else - result = - strictconcat(string char | char = getInRange(low, high) | escape(char) order by char) - ) - } - - /** Gets the entire pretty printed equivalent range. */ - string printEquivalentCharClass(OverlyWideRange range) { - result = - strictconcat(string r, int part | - r = "[" and part = -1 and exists(range) - or - r = printEquivalentCharClass(range, part) - or - r = "]" and part = parts(range) - | - r order by part - ) - } -} - -/** Gets a char range that is overly large because of `reason`. */ -RegExpCharacterRange getABadRange(string reason, int priority) { - result instanceof OverlyWideRange and - priority = 0 and - exists(string equiv | equiv = result.(OverlyWideRange).printEquivalent() | - if equiv.length() <= 50 - then reason = "is equivalent to " + equiv - else reason = "is equivalent to " + equiv.substring(0, 50) + "..." 
- ) - or - priority = 1 and - exists(RegExpCharacterRange other | - reason = "overlaps with " + other + " in the same character class" and - rankRange(result) < rankRange(other) and - overlap(result, other) - ) - or - priority = 2 and - exists(RegExpCharacterClassEscape escape | - reason = "overlaps with " + escape + " in the same character class" and - overlapsWithCharEscape(result, escape) - ) - or - reason = "is empty" and - priority = 3 and - exists(int low, int high | - isRange(result, low, high) and - low > high - ) -} - -/** Holds if `range` matches suspiciously many characters. */ -predicate problem(RegExpCharacterRange range, string reason) { - reason = - strictconcat(string m, int priority | - range = getABadRange(m, priority) - | - m, ", and " order by priority desc - ) and - // specifying a range using an escape is usually OK. - not range.getAChild() instanceof RegExpEscape and - // Unicode escapes in strings are interpreted before it turns into a regexp, - // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. - // We therefore exclude these ranges. - range.getRootTerm().getParent() instanceof RegExpLiteral and - // is used as regexp (mostly for JS where regular expressions are parsed eagerly) - range.getRootTerm().isUsedAsRegExp() -} +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +// OverlyLargeRangeQuery should be used directly from the shared pack, and not from this file. 
+deprecated import codeql.regex.OverlyLargeRangeQuery::Make as Dep +import Dep diff --git a/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll b/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll index 7e1a2c9561c..fac16730f2c 100644 --- a/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll +++ b/python/ql/lib/semmle/python/security/dataflow/PolynomialReDoSCustomizations.qll @@ -11,7 +11,7 @@ private import semmle.python.dataflow.new.TaintTracking private import semmle.python.Concepts private import semmle.python.dataflow.new.RemoteFlowSources private import semmle.python.dataflow.new.BarrierGuards -private import semmle.python.RegexTreeView +private import semmle.python.RegexTreeView::RegexTreeView as TreeView private import semmle.python.ApiGraphs /** @@ -20,6 +20,9 @@ private import semmle.python.ApiGraphs * vulnerabilities, as well as extension points for adding your own. */ module PolynomialReDoS { + private import TreeView + import codeql.regex.nfa.SuperlinearBackTracking::Make + /** * A data flow source for "polynomial regular expression denial of service (ReDoS)" vulnerabilities. 
*/ diff --git a/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll b/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll index f49696c5bad..7bc61ee2aee 100644 --- a/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll +++ b/python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll @@ -103,7 +103,7 @@ module HeuristicNames { */ string notSensitiveRegexp() { result = - "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((?::concretize(t) - ) -} - -/** Holds if `state` has exponential ReDoS */ -predicate hasReDoSResult = ReDoSPruning ::hasReDoSResult/4; +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +// ExponentialBackTracking should be used directly from the shared pack, and not from this file. +deprecated private import codeql.regex.nfa.ExponentialBackTracking::Make as Dep +import Dep diff --git a/python/ql/lib/semmle/python/security/regexp/NfaUtils.qll b/python/ql/lib/semmle/python/security/regexp/NfaUtils.qll index 5ff0cb6a39e..942830d95b2 100644 --- a/python/ql/lib/semmle/python/security/regexp/NfaUtils.qll +++ b/python/ql/lib/semmle/python/security/regexp/NfaUtils.qll @@ -7,1332 +7,7 @@ * other queries that benefit from reasoning about NFAs. */ -import NfaUtilsSpecific - -/** - * Gets the char after `c` (from a simplified ASCII table). - */ -private string nextChar(string c) { exists(int code | code = ascii(c) | code + 1 = ascii(result)) } - -/** - * Gets an approximation for the ASCII code for `char`. - * Only the easily printable chars are included (so no newline, tab, null, etc). - */ -private int ascii(string char) { - char = - rank[result](string c | - c = - "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" - .charAt(_) - ) -} - -/** - * Holds if `t` matches at least an epsilon symbol. - * - * That is, this term does not restrict the language of the enclosing regular expression. 
- * - * This is implemented as an under-approximation, and this predicate does not hold for sub-patterns in particular. - */ -predicate matchesEpsilon(RegExpTerm t) { - t instanceof RegExpStar - or - t instanceof RegExpOpt - or - t.(RegExpRange).getLowerBound() = 0 - or - exists(RegExpTerm child | - child = t.getAChild() and - matchesEpsilon(child) - | - t instanceof RegExpAlt or - t instanceof RegExpGroup or - t instanceof RegExpPlus or - t instanceof RegExpRange - ) - or - matchesEpsilon(t.(RegExpBackRef).getGroup()) - or - forex(RegExpTerm child | child = t.(RegExpSequence).getAChild() | matchesEpsilon(child)) -} - -/** - * A lookahead/lookbehind that matches the empty string. - */ -class EmptyPositiveSubPattern extends RegExpSubPattern { - EmptyPositiveSubPattern() { - ( - this instanceof RegExpPositiveLookahead - or - this instanceof RegExpPositiveLookbehind - ) and - matchesEpsilon(this.getOperand()) - } -} - -/** DEPRECATED: Use `EmptyPositiveSubPattern` instead. */ -deprecated class EmptyPositiveSubPatttern = EmptyPositiveSubPattern; - -/** - * A branch in a disjunction that is the root node in a literal, or a literal - * whose root node is not a disjunction. - */ -class RegExpRoot extends RegExpTerm { - RegExpRoot() { - exists(RegExpParent parent | - exists(RegExpAlt alt | - alt.isRootTerm() and - this = alt.getAChild() and - parent = alt.getParent() - ) - or - this.isRootTerm() and - not this instanceof RegExpAlt and - parent = this.getParent() - ) - } - - /** - * Holds if this root term is relevant to the ReDoS analysis. - */ - predicate isRelevant() { - // is actually used as a RegExp - this.isUsedAsRegExp() and - // not excluded for library specific reasons - not isExcluded(this.getRootTerm().getParent()) - } -} - -/** - * A constant in a regular expression that represents valid Unicode character(s). 
- */ -private class RegexpCharacterConstant extends RegExpConstant { - RegexpCharacterConstant() { this.isCharacter() } -} - -/** - * A regexp term that is relevant for this ReDoS analysis. - */ -class RelevantRegExpTerm extends RegExpTerm { - RelevantRegExpTerm() { getRoot(this).isRelevant() } -} - -/** - * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. - * The string representation includes which flags are used with the regular expression. - * - * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. - * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. - */ -private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { - term = - min(RelevantRegExpTerm t, Location loc, File file | - loc = t.getLocation() and - file = t.getFile() and - str = getCanonicalizationString(t) - | - t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn() - ) -} - -/** - * Gets a string representation of `term` that is used for canonicalization. - */ -private string getCanonicalizationString(RelevantRegExpTerm term) { - exists(string ignoreCase | - (if RegExpFlags::isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and - result = term.getRawValue() + "|" + ignoreCase - ) -} - -/** - * An abstract input symbol, representing a set of concrete characters. - */ -private newtype TInputSymbol = - /** An input symbol corresponding to character `c`. 
*/ - Char(string c) { - c = - any(RegexpCharacterConstant cc | - cc instanceof RelevantRegExpTerm and - not RegExpFlags::isIgnoreCase(cc.getRootTerm()) - ).getValue().charAt(_) - or - // normalize everything to lower case if the regexp is case insensitive - c = - any(RegexpCharacterConstant cc, string char | - cc instanceof RelevantRegExpTerm and - RegExpFlags::isIgnoreCase(cc.getRootTerm()) and - char = cc.getValue().charAt(_) - | - char.toLowerCase() - ) - } or - /** - * An input symbol representing all characters matched by - * a (non-universal) character class that has string representation `charClassString`. - */ - CharClass(string charClassString) { - exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | - recc instanceof RegExpCharacterClass and - not recc.(RegExpCharacterClass).isUniversalClass() - or - isEscapeClass(recc, _) - ) - } or - /** An input symbol representing all characters matched by `.`. */ - Dot() or - /** An input symbol representing all characters. */ - Any() or - /** An epsilon transition in the automaton. */ - Epsilon() - -/** - * Gets the the CharClass corresponding to the canonical representative `term`. - */ -private CharClass getCharClassForCanonicalTerm(RegExpTerm term) { - exists(string str | isCanonicalTerm(term, str) | result = CharClass(str)) -} - -/** - * Gets a char class that represents `term`, even when `term` is not the canonical representative. - */ -CharacterClass getCanonicalCharClass(RegExpTerm term) { - exists(string str | str = getCanonicalizationString(term) and result = CharClass(str)) -} - -/** - * Holds if `a` and `b` are input symbols from the same regexp. - */ -private predicate sharesRoot(InputSymbol a, InputSymbol b) { - exists(RegExpRoot root | - belongsTo(a, root) and - belongsTo(b, root) - ) -} - -/** - * Holds if the `a` is an input symbol from a regexp that has root `root`. 
- */ -private predicate belongsTo(InputSymbol a, RegExpRoot root) { - exists(State s | getRoot(s.getRepr()) = root | - delta(s, a, _) - or - delta(_, a, s) - ) -} - -/** - * An abstract input symbol, representing a set of concrete characters. - */ -class InputSymbol extends TInputSymbol { - InputSymbol() { not this instanceof Epsilon } - - /** - * Gets a string representation of this input symbol. - */ - string toString() { - this = Char(result) - or - this = CharClass(result) - or - this = Dot() and result = "." - or - this = Any() and result = "[^]" - } -} - -/** - * An abstract input symbol that represents a character class. - */ -abstract class CharacterClass extends InputSymbol { - /** - * Gets a character that is relevant for intersection-tests involving this - * character class. - * - * Specifically, this is any of the characters mentioned explicitly in the - * character class, offset by one if it is inverted. For character class escapes, - * the result is as if the class had been written out as a series of intervals. - * - * This set is large enough to ensure that for any two intersecting character - * classes, one contains a relevant character from the other. - */ - abstract string getARelevantChar(); - - /** - * Holds if this character class matches `char`. - */ - bindingset[char] - abstract predicate matches(string char); - - /** - * Gets a character matched by this character class. - */ - string choose() { result = this.getARelevantChar() and this.matches(result) } -} - -/** - * Provides implementations for `CharacterClass`. - */ -private module CharacterClasses { - /** - * Holds if the character class `cc` has a child (constant or range) that matches `char`. 
- */ - pragma[noinline] - predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { - if RegExpFlags::isIgnoreCase(cc.getRootTerm()) - then - // normalize everything to lower case if the regexp is case insensitive - exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase()) - else hasChildThatMatchesIgnoringCasingFlags(cc, char) - } - - /** - * Holds if the character class `cc` has a child (constant or range) that matches `char`. - * Ignores whether the character class is inside a regular expression that has the ignore case flag. - */ - pragma[noinline] - predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) { - exists(getCharClassForCanonicalTerm(cc)) and - exists(RegExpTerm child | child = cc.getAChild() | - char = child.(RegexpCharacterConstant).getValue() - or - rangeMatchesOnLetterOrDigits(child, char) - or - not rangeMatchesOnLetterOrDigits(child, _) and - char = getARelevantChar() and - exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) | - lo <= char and - char <= hi - ) - or - exists(string charClass | isEscapeClass(child, charClass) | - charClass.toLowerCase() = charClass and - classEscapeMatches(charClass, char) - or - char = getARelevantChar() and - charClass.toUpperCase() = charClass and - not classEscapeMatches(charClass, char) - ) - ) - } - - /** - * Holds if `range` is a range on lower-case, upper-case, or digits, and matches `char`. - * This predicate is used to restrict the searchspace for ranges by only joining `getAnyPossiblyMatchedChar` - * on a few ranges. 
- */ - private predicate rangeMatchesOnLetterOrDigits(RegExpCharacterRange range, string char) { - exists(string lo, string hi | - range.isRange(lo, hi) and lo = lowercaseLetter() and hi = lowercaseLetter() - | - lo <= char and - char <= hi and - char = lowercaseLetter() - ) - or - exists(string lo, string hi | - range.isRange(lo, hi) and lo = upperCaseLetter() and hi = upperCaseLetter() - | - lo <= char and - char <= hi and - char = upperCaseLetter() - ) - or - exists(string lo, string hi | range.isRange(lo, hi) and lo = digit() and hi = digit() | - lo <= char and - char <= hi and - char = digit() - ) - } - - private string lowercaseLetter() { result = "abcdefghijklmnopqrstuvwxyz".charAt(_) } - - private string upperCaseLetter() { result = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".charAt(_) } - - private string digit() { result = [0 .. 9].toString() } - - /** - * Gets a char that could be matched by a regular expression. - * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`. - */ - string getARelevantChar() { - exists(ascii(result)) - or - exists(RegexpCharacterConstant c | result = c.getValue().charAt(_)) - or - classEscapeMatches(_, result) - } - - /** - * Gets a char that is mentioned in the character class `c`. 
- */ - private string getAMentionedChar(RegExpCharacterClass c) { - exists(RegExpTerm child | child = c.getAChild() | - result = child.(RegexpCharacterConstant).getValue() - or - child.(RegExpCharacterRange).isRange(result, _) - or - child.(RegExpCharacterRange).isRange(_, result) - or - exists(string charClass | isEscapeClass(child, charClass) | - result = min(string s | classEscapeMatches(charClass.toLowerCase(), s)) - or - result = max(string s | classEscapeMatches(charClass.toLowerCase(), s)) - ) - ) - } - - bindingset[char, cc] - private string caseNormalize(string char, RegExpTerm cc) { - if RegExpFlags::isIgnoreCase(cc.getRootTerm()) - then result = char.toLowerCase() - else result = char - } - - /** - * An implementation of `CharacterClass` for positive (non inverted) character classes. - */ - private class PositiveCharacterClass extends CharacterClass { - RegExpCharacterClass cc; - - PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() } - - override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) } - - override predicate matches(string char) { hasChildThatMatches(cc, char) } - } - - /** - * An implementation of `CharacterClass` for inverted character classes. - */ - private class InvertedCharacterClass extends CharacterClass { - RegExpCharacterClass cc; - - InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() } - - override string getARelevantChar() { - result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or - nextChar(result) = caseNormalize(getAMentionedChar(cc), cc) - } - - bindingset[char] - override predicate matches(string char) { not hasChildThatMatches(cc, char) } - } - - /** - * Holds if the character class escape `clazz` (\d, \s, or \w) matches `char`. 
- */ - pragma[noinline] - private predicate classEscapeMatches(string clazz, string char) { - clazz = "d" and - char = "0123456789".charAt(_) - or - clazz = "s" and - char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f - or - clazz = "w" and - char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_) - } - - /** - * An implementation of `CharacterClass` for \d, \s, and \w. - */ - private class PositiveCharacterClassEscape extends CharacterClass { - string charClass; - RegExpTerm cc; - - PositiveCharacterClassEscape() { - isEscapeClass(cc, charClass) and - this = getCharClassForCanonicalTerm(cc) and - charClass = ["d", "s", "w"] - } - - override string getARelevantChar() { - charClass = "d" and - result = ["0", "9"] - or - charClass = "s" and - result = " " - or - charClass = "w" and - if RegExpFlags::isIgnoreCase(cc.getRootTerm()) - then result = ["a", "z", "_", "0", "9"] - else result = ["a", "Z", "_", "0", "9"] - } - - override predicate matches(string char) { classEscapeMatches(charClass, char) } - - override string choose() { - charClass = "d" and - result = "9" - or - charClass = "s" and - result = " " - or - charClass = "w" and - result = "a" - } - } - - /** - * An implementation of `CharacterClass` for \D, \S, and \W. 
- */ - private class NegativeCharacterClassEscape extends CharacterClass { - string charClass; - - NegativeCharacterClassEscape() { - exists(RegExpTerm cc | - isEscapeClass(cc, charClass) and - this = getCharClassForCanonicalTerm(cc) and - charClass = ["D", "S", "W"] - ) - } - - override string getARelevantChar() { - charClass = "D" and - result = ["a", "Z", "!"] - or - charClass = "S" and - result = ["a", "9", "!"] - or - charClass = "W" and - result = [" ", "!"] - } - - bindingset[char] - override predicate matches(string char) { - not classEscapeMatches(charClass.toLowerCase(), char) - } - } - - /** Gets a representative for all char classes that match the same chars as `c`. */ - CharacterClass normalize(CharacterClass c) { - exists(string normalization | - normalization = getNormalizationString(c) and - result = - min(CharacterClass cc, string raw | - getNormalizationString(cc) = normalization and cc = CharClass(raw) - | - cc order by raw - ) - ) - } - - /** Gets a string representing all the chars matched by `c` */ - private string getNormalizationString(CharacterClass c) { - (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and - result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar()) - or - (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and - // the string produced by the concat can not contain repeated chars - // so by starting the below with "nn" we can guarantee that - // it will not overlap with the above case. - // and a negative char class can never match the same chars as a positive one, so we don't miss any results from this. 
- result = - "nn:" + - concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar()) - } -} - -private class EdgeLabel extends TInputSymbol { - string toString() { - this = Epsilon() and result = "" - or - exists(InputSymbol s | this = s and result = s.toString()) - } -} - -/** - * A RegExp term that acts like a plus. - * Either it's a RegExpPlus, or it is a range {1,X} where X is >= 30. - * 30 has been chosen as a threshold because for exponential blowup 2^30 is enough to get a decent DOS attack. - */ -private class EffectivelyPlus extends RegExpTerm { - EffectivelyPlus() { - this instanceof RegExpPlus - or - exists(RegExpRange range | - range.getLowerBound() = 1 and - (range.getUpperBound() >= 30 or not exists(range.getUpperBound())) - | - this = range - ) - } -} - -/** - * A RegExp term that acts like a star. - * Either it's a RegExpStar, or it is a range {0,X} where X is >= 30. - */ -private class EffectivelyStar extends RegExpTerm { - EffectivelyStar() { - this instanceof RegExpStar - or - exists(RegExpRange range | - range.getLowerBound() = 0 and - (range.getUpperBound() >= 30 or not exists(range.getUpperBound())) - | - this = range - ) - } -} - -/** - * A RegExp term that acts like a question mark. - * Either it's a RegExpQuestion, or it is a range {0,1}. - */ -private class EffectivelyQuestion extends RegExpTerm { - EffectivelyQuestion() { - this instanceof RegExpOpt - or - exists(RegExpRange range | range.getLowerBound() = 0 and range.getUpperBound() = 1 | - this = range - ) - } -} - -/** - * Gets the state before matching `t`. - */ -pragma[inline] -private State before(RegExpTerm t) { result = Match(t, 0) } - -/** - * Gets a state the NFA may be in after matching `t`. 
- */ -State after(RegExpTerm t) { - exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) - or - exists(RegExpSequence seq, int i | t = seq.getChild(i) | - result = before(seq.getChild(i + 1)) - or - i + 1 = seq.getNumChild() and result = after(seq) - ) - or - exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp)) - or - exists(EffectivelyStar star | t = star.getAChild() | - not isPossessive(star) and - result = before(star) - ) - or - exists(EffectivelyPlus plus | t = plus.getAChild() | - not isPossessive(plus) and - result = before(plus) - or - result = after(plus) - ) - or - exists(EffectivelyQuestion opt | t = opt.getAChild() | result = after(opt)) - or - exists(RegExpRoot root | t = root | - if matchesAnySuffix(root) then result = AcceptAnySuffix(root) else result = Accept(root) - ) -} - -/** - * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`. - */ -predicate delta(State q1, EdgeLabel lbl, State q2) { - exists(RegexpCharacterConstant s, int i | - q1 = Match(s, i) and - ( - not RegExpFlags::isIgnoreCase(s.getRootTerm()) and - lbl = Char(s.getValue().charAt(i)) - or - // normalize everything to lower case if the regexp is case insensitive - RegExpFlags::isIgnoreCase(s.getRootTerm()) and - exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) - ) and - ( - q2 = Match(s, i + 1) - or - s.getValue().length() = i + 1 and - q2 = after(s) - ) - ) - or - exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | - if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() - ) - or - exists(RegExpCharacterClass cc | - cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) - or - q1 = before(cc) and - lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and - q2 = after(cc) - ) - or - exists(RegExpTerm cc | isEscapeClass(cc, _) | - q1 = before(cc) and - lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and - 
q2 = after(cc) - ) - or - exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild())) - or - exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0))) - or - exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0))) - or - exists(EffectivelyStar star | lbl = Epsilon() | - q1 = before(star) and q2 = before(star.getChild(0)) - or - q1 = before(star) and q2 = after(star) - ) - or - exists(EffectivelyPlus plus | lbl = Epsilon() | - q1 = before(plus) and q2 = before(plus.getChild(0)) - ) - or - exists(EffectivelyQuestion opt | lbl = Epsilon() | - q1 = before(opt) and q2 = before(opt.getChild(0)) - or - q1 = before(opt) and q2 = after(opt) - ) - or - exists(RegExpRoot root | q1 = AcceptAnySuffix(root) | - lbl = Any() and q2 = q1 - or - lbl = Epsilon() and q2 = Accept(root) - ) - or - exists(RegExpRoot root | q1 = Match(root, 0) | matchesAnyPrefix(root) and lbl = Any() and q2 = q1) - or - exists(RegExpDollar dollar | q1 = before(dollar) | - lbl = Epsilon() and q2 = Accept(getRoot(dollar)) - ) - or - exists(EmptyPositiveSubPattern empty | q1 = before(empty) | lbl = Epsilon() and q2 = after(empty)) -} - -/** - * Gets a state that `q` has an epsilon transition to. - */ -State epsilonSucc(State q) { delta(q, Epsilon(), result) } - -/** - * Gets a state that has an epsilon transition to `q`. - */ -State epsilonPred(State q) { q = epsilonSucc(result) } - -/** - * Holds if there is a state `q` that can be reached from `q1` - * along epsilon edges, such that there is a transition from - * `q` to `q2` that consumes symbol `s`. - */ -predicate deltaClosed(State q1, InputSymbol s, State q2) { delta(epsilonSucc*(q1), s, q2) } - -/** - * Gets the root containing the given term, that is, the root of the literal, - * or a branch of the root disjunction. 
- */ -RegExpRoot getRoot(RegExpTerm term) { - result = term or - result = getRoot(term.getParent()) -} - -/** - * A state in the NFA. - */ -newtype TState = - /** - * A state representing that the NFA is about to match a term. - * `i` is used to index into multi-char literals. - */ - Match(RelevantRegExpTerm t, int i) { - i = 0 - or - exists(t.(RegexpCharacterConstant).getValue().charAt(i)) - } or - /** - * An accept state, where exactly the given input string is accepted. - */ - Accept(RegExpRoot l) { l.isRelevant() } or - /** - * An accept state, where the given input string, or any string that has this - * string as a prefix, is accepted. - */ - AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } - -/** - * Gets a state that is about to match the regular expression `t`. - */ -State mkMatch(RegExpTerm t) { result = Match(t, 0) } - -/** - * A state in the NFA corresponding to a regular expression. - * - * Each regular expression literal `l` has one accepting state - * `Accept(l)`, one state that accepts all suffixes `AcceptAnySuffix(l)`, - * and a state `Match(t, i)` for every subterm `t`, - * which represents the state of the NFA before starting to - * match `t`, or the `i`th character in `t` if `t` is a constant. - */ -class State extends TState { - RegExpTerm repr; - - State() { - this = Match(repr, _) or - this = Accept(repr) or - this = AcceptAnySuffix(repr) - } - - /** - * Gets a string representation for this state in a regular expression. - */ - string toString() { - exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")") - or - this instanceof Accept and - result = "Accept(" + repr + ")" - or - this instanceof AcceptAnySuffix and - result = "AcceptAny(" + repr + ")" - } - - /** - * Gets the location for this state. - */ - Location getLocation() { result = repr.getLocation() } - - /** - * Gets the term represented by this state. 
- */ - RegExpTerm getRepr() { result = repr } -} - -/** - * Gets the minimum char that is matched by both the character classes `c` and `d`. - */ -private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) { - result = min(getAOverlapBetweenCharacterClasses(c, d)) -} - -/** - * Gets a char that is matched by both the character classes `c` and `d`. - * And `c` and `d` is not the same character class. - */ -private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) { - sharesRoot(c, d) and - result = [c.getARelevantChar(), d.getARelevantChar()] and - c.matches(result) and - d.matches(result) and - not c = d -} - -/** - * Gets a character that is represented by both `c` and `d`. - */ -string intersect(InputSymbol c, InputSymbol d) { - (sharesRoot(c, d) or [c, d] = Any()) and - ( - c = Char(result) and - d = getAnInputSymbolMatching(result) - or - result = getMinOverlapBetweenCharacterClasses(c, d) - or - result = c.(CharacterClass).choose() and - ( - d = c - or - d = Dot() and - not (result = "\n" or result = "\r") - or - d = Any() - ) - or - (c = Dot() or c = Any()) and - (d = Dot() or d = Any()) and - result = "a" - ) - or - result = intersect(d, c) -} - -/** - * Gets a symbol that matches `char`. - */ -bindingset[char] -InputSymbol getAnInputSymbolMatching(string char) { - result = Char(char) - or - result.(CharacterClass).matches(char) - or - result = Dot() and - not (char = "\n" or char = "\r") - or - result = Any() -} - -/** - * Holds if `state` is a start state. - */ -predicate isStartState(State state) { - state = mkMatch(any(RegExpRoot r)) - or - exists(RegExpCaret car | state = after(car)) -} - -/** - * Holds if `state` is a candidate for ReDoS with string `pump`. - */ -signature predicate isCandidateSig(State state, string pump); - -/** - * Holds if `state` is a candidate for ReDoS. 
- */ -signature predicate isCandidateSig(State state); - -/** - * Predicates for constructing a prefix string that leads to a given state. - */ -module PrefixConstruction { - /** - * Holds if `state` is the textually last start state for the regular expression. - */ - private predicate lastStartState(RelevantState state) { - exists(RegExpRoot root | - state = - max(RelevantState s, Location l | - isStartState(s) and - getRoot(s.getRepr()) = root and - l = s.getRepr().getLocation() - | - s - order by - l.getStartLine(), l.getStartColumn(), s.getRepr().toString(), l.getEndColumn(), - l.getEndLine() - ) - ) - } - - /** - * Holds if there exists any transition (Epsilon() or other) from `a` to `b`. - */ - private predicate existsTransition(State a, State b) { delta(a, _, b) } - - /** - * Gets the minimum number of transitions it takes to reach `state` from the `start` state. - */ - int prefixLength(State start, State state) = - shortestDistances(lastStartState/1, existsTransition/2)(start, state, result) - - /** - * Gets the minimum number of transitions it takes to reach `state` from the start state. - */ - private int lengthFromStart(State state) { result = prefixLength(_, state) } - - /** - * Gets a string for which the regular expression will reach `state`. - * - * Has at most one result for any given `state`. - * This predicate will not always have a result even if there is a ReDoS issue in - * the regular expression. - */ - string prefix(State state) { - lastStartState(state) and - result = "" - or - // the search stops past the last redos candidate state. 
- lengthFromStart(state) <= max(lengthFromStart(any(State s | isCandidate(s)))) and - exists(State prev | - // select a unique predecessor (by an arbitrary measure) - prev = - min(State s, Location loc | - lengthFromStart(s) = lengthFromStart(state) - 1 and - loc = s.getRepr().getLocation() and - delta(s, _, state) - | - s - order by - loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(), loc.getEndColumn(), - s.getRepr().toString() - ) - | - // greedy search for the shortest prefix - result = prefix(prev) and delta(prev, Epsilon(), state) - or - not delta(prev, Epsilon(), state) and - result = prefix(prev) + getCanonicalEdgeChar(prev, state) - ) - } - - /** - * Gets a canonical char for which there exists a transition from `prev` to `next` in the NFA. - */ - private string getCanonicalEdgeChar(State prev, State next) { - result = - min(string c | delta(prev, any(InputSymbol symbol | c = intersect(Any(), symbol)), next)) - } - - /** A state within a regular expression that contains a candidate state. */ - class RelevantState instanceof State { - RelevantState() { - exists(State s | isCandidate(s) | getRoot(s.getRepr()) = getRoot(this.getRepr())) - } - - /** Gets a string representation for this state in a regular expression. */ - string toString() { result = State.super.toString() } - - /** Gets the term represented by this state. */ - RegExpTerm getRepr() { result = State.super.getRepr() } - } -} - -/** - * A module for pruning candidate ReDoS states. - * The candidates are specified by the `isCandidate` signature predicate. - * The candidates are checked for rejecting suffixes and deduplicated, - * and the resulting ReDoS states are read by the `hasReDoSResult` predicate. - */ -module ReDoSPruning { - /** - * Holds if repeating `pump` starting at `state` is a candidate for causing backtracking. - * No check whether a rejected suffix exists has been made. 
- */ - private predicate isReDoSCandidate(State state, string pump) { - isCandidate(state, pump) and - not state = acceptsAnySuffix() and // pruning early - these can never get stuck in a rejecting state. - ( - not isCandidate(epsilonSucc+(state), _) - or - epsilonSucc+(state) = state and - state = - max(State s, Location l | - s = epsilonSucc+(state) and - l = s.getRepr().getLocation() and - isCandidate(s, _) and - s.getRepr() instanceof InfiniteRepetitionQuantifier - | - s order by l.getStartLine(), l.getStartColumn(), l.getEndColumn(), l.getEndLine() - ) - ) - } - - /** Gets a state that can reach the `accept-any` state using only epsilon steps. */ - private State acceptsAnySuffix() { epsilonSucc*(result) = AcceptAnySuffix(_) } - - predicate isCandidateState(State s) { isReDoSCandidate(s, _) } - - import PrefixConstruction as Prefix - - class RelevantState = Prefix::RelevantState; - - /** - * Predicates for testing the presence of a rejecting suffix. - * - * These predicates are used to ensure that the all states reached from the fork - * by repeating `w` have a rejecting suffix. - * - * For example, a regexp like `/^(a+)+/` will accept any string as long the prefix is - * some number of `"a"`s, and it is therefore not possible to construct a rejecting suffix. - * - * A regexp like `/(a+)+$/` or `/(a+)+b/` trivially has a rejecting suffix, - * as the suffix "X" will cause both the regular expressions to be rejected. - * - * The string `w` is repeated any number of times because it needs to be - * infinitely repeatable for the attack to work. - * For the regular expression `/((ab)+)*abab/` the accepting state is not reachable from the fork - * using epsilon transitions. But any attempt at repeating `w` will end in a state that accepts all suffixes. - */ - private module SuffixConstruction { - /** - * Holds if all states reachable from `fork` by repeating `w` - * are likely rejectable by appending some suffix. 
- */ - predicate reachesOnlyRejectableSuffixes(State fork, string w) { - isReDoSCandidate(fork, w) and - forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and - not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened. - } - - /** - * Holds if there likely exists a suffix starting from `s` that leads to the regular expression being rejected. - * This predicate might find impossible suffixes when searching for suffixes of length > 1, which can cause FPs. - */ - pragma[noinline] - private predicate isLikelyRejectable(RelevantState s) { - // exists a reject edge with some char. - hasRejectEdge(s) - or - hasEdgeToLikelyRejectable(s) - or - // stopping here is rejection - isRejectState(s) - } - - /** - * Holds if `s` is not an accept state, and there is no epsilon transition to an accept state. - */ - predicate isRejectState(RelevantState s) { not epsilonSucc*(s) = Accept(_) } - - /** - * Holds if there is likely a non-empty suffix leading to rejection starting in `s`. - */ - pragma[noopt] - predicate hasEdgeToLikelyRejectable(RelevantState s) { - // all edges (at least one) with some char leads to another state that is rejectable. - // the `next` states might not share a common suffix, which can cause FPs. - exists(string char | char = hasEdgeToLikelyRejectableHelper(s) | - // noopt to force `hasEdgeToLikelyRejectableHelper` to be first in the join-order. - exists(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next)) and - forall(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next)) - ) - } - - /** - * Gets a char for there exists a transition away from `s`, - * and `s` has not been found to be rejectable by `hasRejectEdge` or `isRejectState`. 
- */ - pragma[noinline] - private string hasEdgeToLikelyRejectableHelper(RelevantState s) { - not hasRejectEdge(s) and - not isRejectState(s) and - deltaClosedChar(s, result, _) - } - - /** - * Holds if there is a state `next` that can be reached from `prev` - * along epsilon edges, such that there is a transition from - * `prev` to `next` that the character symbol `char`. - */ - predicate deltaClosedChar(RelevantState prev, string char, RelevantState next) { - deltaClosed(prev, getAnInputSymbolMatchingRelevant(char), next) - } - - pragma[noinline] - InputSymbol getAnInputSymbolMatchingRelevant(string char) { - char = relevant(_) and - result = getAnInputSymbolMatching(char) - } - - pragma[noinline] - RegExpRoot relevantRoot() { - exists(RegExpTerm term, State s | - s.getRepr() = term and isCandidateState(s) and result = term.getRootTerm() - ) - } - - /** - * Gets a char used for finding possible suffixes inside `root`. - */ - pragma[noinline] - private string relevant(RegExpRoot root) { - root = relevantRoot() and - ( - exists(ascii(result)) and exists(root) - or - exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _)) - or - // The characters from `hasSimpleRejectEdge`. Only `\n` is really needed (as `\n` is not in the `ascii` relation). - // The three chars must be kept in sync with `hasSimpleRejectEdge`. - result = ["|", "\n", "Z"] and exists(root) - ) - } - - /** - * Holds if there exists a `char` such that there is no edge from `s` labeled `char` in our NFA. - * The NFA does not model reject states, so the above is the same as saying there is a reject edge. - */ - private predicate hasRejectEdge(State s) { - hasSimpleRejectEdge(s) - or - not hasSimpleRejectEdge(s) and - exists(string char | char = relevant(getRoot(s.getRepr())) | not deltaClosedChar(s, char, _)) - } - - /** - * Holds if there is no edge from `s` labeled with "|", "\n", or "Z" in our NFA. - * This predicate is used as a cheap pre-processing to speed up `hasRejectEdge`. 
- */ - private predicate hasSimpleRejectEdge(State s) { - // The three chars were chosen arbitrarily. The three chars must be kept in sync with `relevant`. - exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _)) - } - - /** - * Gets a state that can be reached from pumpable `fork` consuming all - * chars in `w` any number of times followed by the first `i+1` characters of `w`. - */ - pragma[noopt] - private State process(State fork, string w, int i) { - exists(State prev | prev = getProcessPrevious(fork, i, w) | - not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected. - exists(string char, InputSymbol sym | - char = w.charAt(i) and - deltaClosed(prev, sym, result) and - // noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`. - // Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found. - sym = getAProcessInputSymbol(char) - ) - ) - } - - /** - * Gets a state that can be reached from pumpable `fork` consuming all - * chars in `w` any number of times followed by the first `i` characters of `w`. - */ - private State getProcessPrevious(State fork, int i, string w) { - isReDoSCandidate(fork, w) and - ( - i = 0 and result = fork - or - result = process(fork, w, i - 1) - or - // repeat until fixpoint - i = 0 and - result = process(fork, w, w.length() - 1) - ) - } - - /** - * Gets an InputSymbol that matches `char`. - * The predicate is specialized to only have a result for the `char`s that are relevant for the `process` predicate. - */ - private InputSymbol getAProcessInputSymbol(string char) { - char = getAProcessChar() and - result = getAnInputSymbolMatching(char) - } - - /** - * Gets a `char` that occurs in a `pump` string. 
- */ - private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) } - } - - /** - * Holds if `term` may cause superlinear backtracking on strings containing many repetitions of `pump`. - * Gets the shortest string that causes superlinear backtracking. - */ - private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) { - exists(int i, string c | s = Match(term, i) | - c = - min(string w | - isCandidate(s, w) and - SuffixConstruction::reachesOnlyRejectableSuffixes(s, w) - | - w order by w.length(), w - ) and - pump = escape(rotate(c, i)) - ) - } - - /** - * Holds if the state `s` (represented by the term `t`) can have backtracking with repetitions of `pump`. - * - * `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found). - */ - predicate hasReDoSResult(RegExpTerm t, string pump, State s, string prefixMsg) { - isReDoSAttackable(t, pump, s) and - ( - prefixMsg = "starting with '" + escape(Prefix::prefix(s)) + "' and " and - not Prefix::prefix(s) = "" - or - Prefix::prefix(s) = "" and prefixMsg = "" - or - not exists(Prefix::prefix(s)) and prefixMsg = "" - ) - } - - /** - * Gets the result of backslash-escaping newlines, carriage-returns and - * backslashes in `s`. - */ - bindingset[s] - private string escape(string s) { - result = - s.replaceAll("\\", "\\\\") - .replaceAll("\n", "\\n") - .replaceAll("\r", "\\r") - .replaceAll("\t", "\\t") - } - - /** - * Gets `str` with the last `i` characters moved to the front. - * - * We use this to adjust the pump string to match with the beginning of - * a RegExpTerm, so it doesn't start in the middle of a constant. 
- */ - bindingset[str, i] - private string rotate(string str, int i) { - result = str.suffix(str.length() - i) + str.prefix(str.length() - i) - } -} - -/** - * A module that describes a tree where each node has one or more associated characters, also known as a trie. - * The root node has no associated character. - * This module is a signature used in `Concretizer`. - */ -signature module CharTree { - /** A node in the tree. */ - class CharNode; - - /** Gets the previous node in the tree from `t`. */ - CharNode getPrev(CharNode t); - - /** - * Holds if `n` is at the end of a tree. I.e. a node that should have a result in the `Concretizer` module. - * Such a node can still have children. - */ - predicate isARelevantEnd(CharNode n); - - /** Gets a char associated with `t`. */ - string getChar(CharNode t); -} - -/** - * Implements an algorithm for computing all possible strings - * from following a tree of nodes (as described in `CharTree`). - * - * The string is build using one big concat, where all the chars are computed first. - * See `concretize`. - */ -module Concretizer { - private class Node = Impl::CharNode; - - private predicate getPrev = Impl::getPrev/1; - - private predicate isARelevantEnd = Impl::isARelevantEnd/1; - - private predicate getChar = Impl::getChar/1; - - /** Holds if `n` is on a path from the root to a leaf, and is therefore relevant for the results in `concretize`. */ - private predicate isRelevant(Node n) { - isARelevantEnd(n) - or - exists(Node succ | isRelevant(succ) | n = getPrev(succ)) - } - - /** Holds if `n` is a root with no predecessors. */ - private predicate isRoot(Node n) { not exists(getPrev(n)) } - - /** Gets the distance from a root to `n`. */ - private int nodeDepth(Node n) { - result = 0 and isRoot(n) - or - isRelevant(n) and - exists(Node prev | result = nodeDepth(prev) + 1 | prev = getPrev(n)) - } - - /** Gets an ancestor of `end`, where `end` is a node that should have a result in `concretize`. 
*/ - private Node getAnAncestor(Node end) { isARelevantEnd(end) and result = getPrev*(end) } - - /** Gets the `i`th character on the path from the root to `n`. */ - pragma[noinline] - private string getPrefixChar(Node n, int i) { - exists(Node ancestor | - result = getChar(ancestor) and - ancestor = getAnAncestor(n) and - i = nodeDepth(ancestor) - ) - } - - /** Gets a string corresponding to `node`. */ - language[monotonicAggregates] - string concretize(Node n) { - result = strictconcat(int i | exists(getPrefixChar(n, i)) | getPrefixChar(n, i) order by i) - } -} +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +// NfaUtils should be used directly from the shared pack, and not from this file. +deprecated private import codeql.regex.nfa.NfaUtils::Make as Dep +import Dep diff --git a/python/ql/lib/semmle/python/security/regexp/NfaUtilsSpecific.qll b/python/ql/lib/semmle/python/security/regexp/NfaUtilsSpecific.qll deleted file mode 100644 index 70ed5bcedad..00000000000 --- a/python/ql/lib/semmle/python/security/regexp/NfaUtilsSpecific.qll +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Provides Python-specific definitions for use in the NfaUtils module. - */ - -import python -import semmle.python.RegexTreeView - -/** - * Holds if `term` is an escape class representing e.g. `\d`. - * `clazz` is which character class it represents, e.g. "d" for `\d`. - */ -predicate isEscapeClass(RegExpTerm term, string clazz) { - exists(RegExpCharacterClassEscape escape | term = escape | escape.getValue() = clazz) -} - -/** - * Holds if `term` is a possessive quantifier. - * As python's regexes do not support possessive quantifiers, this never holds, but is used by the shared library. - */ -predicate isPossessive(RegExpQuantifier term) { none() } - -/** - * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. - * Not yet implemented for Python. 
- */ -predicate matchesAnyPrefix(RegExpTerm term) { any() } - -/** - * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. - * Not yet implemented for Python. - */ -predicate matchesAnySuffix(RegExpTerm term) { any() } - -/** - * Holds if the regular expression should not be considered. - * - * We make the pragmatic performance optimization to ignore regular expressions in files - * that does not belong to the project code (such as installed dependencies). - */ -predicate isExcluded(RegExpParent parent) { - not exists(parent.getRegex().getLocation().getFile().getRelativePath()) - or - // Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so - // we explicitly exclude these. - count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10 -} - -/** - * A module containing predicates for determining which flags a regular expression have. - */ -module RegExpFlags { - /** - * Holds if `root` has the `i` flag for case-insensitive matching. - */ - predicate isIgnoreCase(RegExpTerm root) { - root.isRootTerm() and - root.getLiteral().isIgnoreCase() - } - - /** - * Gets the flags for `root`, or the empty string if `root` has no flags. - */ - string getFlags(RegExpTerm root) { - root.isRootTerm() and - result = root.getLiteral().getFlags() - } - - /** - * Holds if `root` has the `s` flag for multi-line matching. - */ - predicate isDotAll(RegExpTerm root) { - root.isRootTerm() and - root.getLiteral().isDotAll() - } -} diff --git a/python/ql/lib/semmle/python/security/regexp/RegexpMatching.qll b/python/ql/lib/semmle/python/security/regexp/RegexpMatching.qll index e2c75ff980b..d73a67add16 100644 --- a/python/ql/lib/semmle/python/security/regexp/RegexpMatching.qll +++ b/python/ql/lib/semmle/python/security/regexp/RegexpMatching.qll @@ -3,155 +3,7 @@ * and for testing which capture groups are filled when a particular regexp matches a string. 
*/ -import NfaUtils - -/** A root term */ -class RootTerm extends RegExpTerm { - RootTerm() { this.isRootTerm() } -} - -/** - * Holds if it should be tested whether `root` matches `str`. - * - * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor. - * E.g. a regular expression `/foo$/` will match any string that ends with "foo", - * but if `ignorePrefix` is true, it will only match "foo". - * - * If `testWithGroups` is true, then the `RegexpMatching::fillsCaptureGroup` predicate can be used to determine which capture - * groups are filled by a string. - */ -signature predicate isRegexpMatchingCandidateSig( - RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups -); - -/** - * A module for determining if a regexp matches a given string, - * and reasoning about which capture groups are filled by a given string. - * - * The module parameter `isCandidate` determines which strings should be tested, - * and the results can be read from the `matches` and `fillsCaptureGroup` predicates. - */ -module RegexpMatching { - /** - * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`. - * The regular expression is modeled as a non-determistic finite automaton, - * the regular expression can therefore be in multiple states after matching a character. - * - * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state. - */ - private State getAState(RootTerm reg, int i, string str, boolean ignorePrefix) { - // start state, the -1 position before any chars have been matched - i = -1 and - isCandidate(reg, str, ignorePrefix, _) and - result.getRepr().getRootTerm() = reg and - isStartState(result) - or - // recursive case - result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix) - } - - /** - * Gets the next state after the `prev` state from `reg`. 
- * `prev` is the state after matching `fromIndex` chars in `str`, - * and the result is the state after matching `toIndex` chars in `str`. - * - * This predicate is used as a step relation in the forwards search (`getAState`), - * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`). - */ - private State getAStateAfterMatching( - RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix - ) { - // the basic recursive case - outlined into a noopt helper to make performance work out. - result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix) - or - // we can skip past word boundaries if the next char is a non-word char. - fromIndex = toIndex and - prev.getRepr() instanceof RegExpWordBoundary and - prev = getAState(reg, toIndex, str, ignorePrefix) and - after(prev.getRepr()) = result and - str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char. - } - - pragma[noopt] - private State getAStateAfterMatchingAux( - RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix - ) { - prev = getAState(reg, fromIndex, str, ignorePrefix) and - fromIndex = toIndex - 1 and - exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and - not discardedPrefixStep(prev, result, ignorePrefix) - } - - /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */ - private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) { - prev = mkMatch(any(RegExpRoot r)) and - ignorePrefix = true and - next = prev - } - - // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`. 
- private predicate specializedDeltaClosed(State prev, string char, State next) { - deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next) - } - - // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`. - pragma[noinline] - private InputSymbol specializedGetAnInputSymbolMatching(string char) { - exists(string s, RootTerm r | isCandidate(r, s, _, _) | char = s.charAt(_)) and - result = getAnInputSymbolMatching(char) - } - - /** - * Gets the `i`th state on a path to the accepting state when `reg` matches `str`. - * Starts with an accepting state as found by `getAState` and searches backwards - * to the start state through the reachable states (as found by `getAState`). - * - * This predicate satisfies the invariant that the result state can be reached with `i` steps from a start state, - * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result. - * The result state is therefore always on a valid path where `reg` accepts `str`. - * - * This predicate is only used to find which capture groups a regular expression has filled, - * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate. - */ - private State getAStateThatReachesAccept(RootTerm reg, int i, string str, boolean ignorePrefix) { - // base case, reaches an accepting state from the last state in `getAState(..)` - isCandidate(reg, str, ignorePrefix, true) and - i = str.length() - 1 and - result = getAState(reg, i, str, ignorePrefix) and - epsilonSucc*(result) = Accept(_) - or - // recursive case. `next` is the next state to be matched after matching `prev`. - // this predicate is doing a backwards search, so `prev` is the result we are looking for. 
- exists(State next, State prev, int fromIndex, int toIndex | - next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and - next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and - i = fromIndex and - result = prev - ) - } - - /** Gets the capture group number that `term` belongs to. */ - private int group(RegExpTerm term) { - exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp) - } - - /** - * Holds if `reg` matches `str`, where `str` is in the `isCandidate` predicate. - */ - predicate matches(RootTerm reg, string str) { - exists(State state | state = getAState(reg, str.length() - 1, str, _) | - epsilonSucc*(state) = Accept(_) - ) - } - - /** - * Holds if matching `str` against `reg` may fill capture group number `g`. - * Only holds if `str` is in the `testWithGroups` predicate. - */ - predicate fillsCaptureGroup(RootTerm reg, string str, int g) { - exists(State s | - s = getAStateThatReachesAccept(reg, _, str, _) and - g = group(s.getRepr()) - ) - } -} +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +// RegexpMatching should be used directly from the shared pack, and not from this file. +deprecated import codeql.regex.nfa.RegexpMatching::Make as Dep +import Dep diff --git a/python/ql/lib/semmle/python/security/regexp/SuperlinearBackTracking.qll b/python/ql/lib/semmle/python/security/regexp/SuperlinearBackTracking.qll index 14a69dc0644..6eca3722e09 100644 --- a/python/ql/lib/semmle/python/security/regexp/SuperlinearBackTracking.qll +++ b/python/ql/lib/semmle/python/security/regexp/SuperlinearBackTracking.qll @@ -1,11 +1,4 @@ /** - * Provides classes for working with regular expressions that can - * perform backtracking in superlinear time. - */ - -import NfaUtils - -/* * This module implements the analysis described in the paper: * Valentin Wustholz, Oswaldo Olivo, Marijn J. H. 
Heule, and Isil Dillig: * Static Detection of DoS Vulnerabilities in @@ -42,377 +35,7 @@ import NfaUtils * It also doesn't find all transitions in the product automaton, which can cause false negatives. */ -/** - * Gets any root (start) state of a regular expression. - */ -private State getRootState() { result = mkMatch(any(RegExpRoot r)) } - -private newtype TStateTuple = - MkStateTuple(State q1, State q2, State q3) { - // starts at (pivot, pivot, succ) - isStartLoops(q1, q3) and q1 = q2 - or - step(_, _, _, _, q1, q2, q3) and FeasibleTuple::isFeasibleTuple(q1, q2, q3) - } - -/** - * A state in the product automaton. - * The product automaton contains 3-tuples of states. - * - * We lazily only construct those states that we are actually - * going to need. - * Either a start state `(pivot, pivot, succ)`, or a state - * where there exists a transition from an already existing state. - * - * The exponential variant of this query (`js/redos`) uses an optimization - * trick where `q1 <= q2`. This trick cannot be used here as the order - * of the elements matter. - */ -class StateTuple extends TStateTuple { - State q1; - State q2; - State q3; - - StateTuple() { this = MkStateTuple(q1, q2, q3) } - - /** - * Gest a string representation of this tuple. - */ - string toString() { result = "(" + q1 + ", " + q2 + ", " + q3 + ")" } - - /** - * Holds if this tuple is `(r1, r2, r3)`. - */ - pragma[noinline] - predicate isTuple(State r1, State r2, State r3) { r1 = q1 and r2 = q2 and r3 = q3 } -} - -/** - * A module for determining feasible tuples for the product automaton. - * - * The implementation is split into many predicates for performance reasons. - */ -private module FeasibleTuple { - /** - * Holds if the tuple `(r1, r2, r3)` might be on path from a start-state to an end-state in the product automaton. 
- */ - pragma[inline] - predicate isFeasibleTuple(State r1, State r2, State r3) { - // The first element is either inside a repetition (or the start state itself) - isRepetitionOrStart(r1) and - // The last element is inside a repetition - stateInsideRepetition(r3) and - // The states are reachable in the NFA in the order r1 -> r2 -> r3 - delta+(r1) = r2 and - delta+(r2) = r3 and - // The first element can reach a beginning (the "pivot" state in a `(pivot, succ)` pair). - canReachABeginning(r1) and - // The last element can reach a target (the "succ" state in a `(pivot, succ)` pair). - canReachATarget(r3) - } - - /** - * Holds if `s` is either inside a repetition, or is the start state (which is a repetition). - */ - pragma[noinline] - private predicate isRepetitionOrStart(State s) { stateInsideRepetition(s) or s = getRootState() } - - /** - * Holds if state `s` might be inside a backtracking repetition. - */ - pragma[noinline] - private predicate stateInsideRepetition(State s) { - s.getRepr().getParent*() instanceof InfiniteRepetitionQuantifier - } - - /** - * Holds if there exists a path in the NFA from `s` to a "pivot" state - * (from a `(pivot, succ)` pair that starts the search). - */ - pragma[noinline] - private predicate canReachABeginning(State s) { - delta+(s) = any(State pivot | isStartLoops(pivot, _)) - } - - /** - * Holds if there exists a path in the NFA from `s` to a "succ" state - * (from a `(pivot, succ)` pair that starts the search). - */ - pragma[noinline] - private predicate canReachATarget(State s) { delta+(s) = any(State succ | isStartLoops(_, succ)) } -} - -/** - * Holds if `pivot` and `succ` are a pair of loops that could be the beginning of a quadratic blowup. - * - * There is a slight implementation difference compared to the paper: this predicate requires that `pivot != succ`. - * The case where `pivot = succ` causes exponential backtracking and is handled by the `js/redos` query. 
- */ -predicate isStartLoops(State pivot, State succ) { - pivot != succ and - succ.getRepr() instanceof InfiniteRepetitionQuantifier and - delta+(pivot) = succ and - ( - pivot.getRepr() instanceof InfiniteRepetitionQuantifier - or - pivot = mkMatch(any(RegExpRoot root)) - ) -} - -/** - * Gets a state for which there exists a transition in the NFA from `s'. - */ -State delta(State s) { delta(s, _, result) } - -/** - * Holds if there are transitions from the components of `q` to the corresponding - * components of `r` labelled with `s1`, `s2`, and `s3`, respectively. - */ -pragma[noinline] -predicate step(StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, StateTuple r) { - exists(State r1, State r2, State r3 | - step(q, s1, s2, s3, r1, r2, r3) and r = MkStateTuple(r1, r2, r3) - ) -} - -/** - * Holds if there are transitions from the components of `q` to `r1`, `r2`, and `r3 - * labelled with `s1`, `s2`, and `s3`, respectively. - */ -pragma[noopt] -predicate step( - StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, State r1, State r2, State r3 -) { - exists(State q1, State q2, State q3 | q.isTuple(q1, q2, q3) | - deltaClosed(q1, s1, r1) and - deltaClosed(q2, s2, r2) and - deltaClosed(q3, s3, r3) and - // use noopt to force the join on `getAThreewayIntersect` to happen last. - exists(getAThreewayIntersect(s1, s2, s3)) - ) -} - -/** - * Gets a char that is matched by all the edges `s1`, `s2`, and `s3`. - * - * The result is not complete, and might miss some combination of edges that share some character. 
- */ -pragma[noinline] -string getAThreewayIntersect(InputSymbol s1, InputSymbol s2, InputSymbol s3) { - result = minAndMaxIntersect(s1, s2) and result = [intersect(s2, s3), intersect(s1, s3)] - or - result = minAndMaxIntersect(s1, s3) and result = [intersect(s2, s3), intersect(s1, s2)] - or - result = minAndMaxIntersect(s2, s3) and result = [intersect(s1, s2), intersect(s1, s3)] -} - -/** - * Gets the minimum and maximum characters that intersect between `a` and `b`. - * This predicate is used to limit the size of `getAThreewayIntersect`. - */ -pragma[noinline] -string minAndMaxIntersect(InputSymbol a, InputSymbol b) { - result = [min(intersect(a, b)), max(intersect(a, b))] -} - -private newtype TTrace = - Nil() or - Step(InputSymbol s1, InputSymbol s2, InputSymbol s3, TTrace t) { - isReachableFromStartTuple(_, _, t, s1, s2, s3, _, _) - } - -/** - * A list of tuples of input symbols that describe a path in the product automaton - * starting from some start state. - */ -class Trace extends TTrace { - /** - * Gets a string representation of this Trace that can be used for debug purposes. - */ - string toString() { - this = Nil() and result = "Nil()" - or - exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace t | this = Step(s1, s2, s3, t) | - result = "Step(" + s1 + ", " + s2 + ", " + s3 + ", " + t + ")" - ) - } -} - -/** - * Holds if there exists a transition from `r` to `q` in the product automaton. - * Notice that the arguments are flipped, and thus the direction is backwards. - */ -pragma[noinline] -predicate tupleDeltaBackwards(StateTuple q, StateTuple r) { step(r, _, _, _, q) } - -/** - * Holds if `tuple` is an end state in our search. - * That means there exists a pair of loops `(pivot, succ)` such that `tuple = (pivot, succ, succ)`. - */ -predicate isEndTuple(StateTuple tuple) { tuple = getAnEndTuple(_, _) } - -/** - * Gets the minimum length of a path from `r` to some an end state `end`. 
- * - * The implementation searches backwards from the end-tuple. - * This approach was chosen because it is way more efficient if the first predicate given to `shortestDistances` is small. - * The `end` argument must always be an end state. - */ -int distBackFromEnd(StateTuple r, StateTuple end) = - shortestDistances(isEndTuple/1, tupleDeltaBackwards/2)(end, r, result) - -/** - * Holds if there exists a pair of repetitions `(pivot, succ)` in the regular expression such that: - * `tuple` is reachable from `(pivot, pivot, succ)` in the product automaton, - * and there is a distance of `dist` from `tuple` to the nearest end-tuple `(pivot, succ, succ)`, - * and a path from a start-state to `tuple` follows the transitions in `trace`. - */ -private predicate isReachableFromStartTuple( - State pivot, State succ, StateTuple tuple, Trace trace, int dist -) { - exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace v | - isReachableFromStartTuple(pivot, succ, v, s1, s2, s3, tuple, dist) and - trace = Step(s1, s2, s3, v) - ) -} - -private predicate isReachableFromStartTuple( - State pivot, State succ, Trace trace, InputSymbol s1, InputSymbol s2, InputSymbol s3, - StateTuple tuple, int dist -) { - // base case. - exists(State q1, State q2, State q3 | - isStartLoops(pivot, succ) and - step(MkStateTuple(pivot, pivot, succ), s1, s2, s3, tuple) and - tuple = MkStateTuple(q1, q2, q3) and - trace = Nil() and - dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ)) - ) - or - // recursive case - exists(StateTuple p | - isReachableFromStartTuple(pivot, succ, p, trace, dist + 1) and - dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ)) and - step(p, s1, s2, s3, tuple) - ) -} - -/** - * Gets the tuple `(pivot, succ, succ)` from the product automaton. - */ -StateTuple getAnEndTuple(State pivot, State succ) { - isStartLoops(pivot, succ) and - result = MkStateTuple(pivot, succ, succ) -} - -/** An implementation of a chain containing chars for use by `Concretizer`. 
*/ -private module CharTreeImpl implements CharTree { - class CharNode = Trace; - - CharNode getPrev(CharNode t) { t = Step(_, _, _, result) } - - /** Holds if `n` is used in `isPumpable`. */ - predicate isARelevantEnd(CharNode n) { - exists(State pivot, State succ | - isReachableFromStartTuple(pivot, succ, getAnEndTuple(pivot, succ), n, _) - ) - } - - string getChar(CharNode t) { - exists(InputSymbol s1, InputSymbol s2, InputSymbol s3 | t = Step(s1, s2, s3, _) | - result = getAThreewayIntersect(s1, s2, s3) - ) - } -} - -/** - * Holds if matching repetitions of `pump` can: - * 1) Transition from `pivot` back to `pivot`. - * 2) Transition from `pivot` to `succ`. - * 3) Transition from `succ` to `succ`. - * - * From theorem 3 in the paper linked in the top of this file we can therefore conclude that - * the regular expression has polynomial backtracking - if a rejecting suffix exists. - * - * This predicate is used by `SuperLinearReDoSConfiguration`, and the final results are - * available in the `hasReDoSResult` predicate. - */ -predicate isPumpable(State pivot, State succ, string pump) { - exists(StateTuple q, Trace t | - isReachableFromStartTuple(pivot, succ, q, t, _) and - q = getAnEndTuple(pivot, succ) and - pump = Concretizer ::concretize(t) - ) -} - -/** - * Holds if states starting in `state` can have polynomial backtracking with the string `pump`. - */ -predicate isReDoSCandidate(State state, string pump) { isPumpable(_, state, pump) } - -/** - * Holds if repetitions of `pump` at `t` will cause polynomial backtracking. - */ -predicate polynomialReDoS(RegExpTerm t, string pump, string prefixMsg, RegExpTerm prev) { - exists(State s, State pivot | - ReDoSPruning ::hasReDoSResult(t, pump, s, prefixMsg) and - isPumpable(pivot, s, _) and - prev = pivot.getRepr() - ) -} - -/** - * Gets a message for why `term` can cause polynomial backtracking. 
- */ -string getReasonString(RegExpTerm term, string pump, string prefixMsg, RegExpTerm prev) { - polynomialReDoS(term, pump, prefixMsg, prev) and - result = - "Strings " + prefixMsg + "with many repetitions of '" + pump + - "' can start matching anywhere after the start of the preceeding " + prev -} - -/** - * A term that may cause a regular expression engine to perform a - * polynomial number of match attempts, relative to the input length. - */ -class PolynomialBackTrackingTerm extends InfiniteRepetitionQuantifier { - string reason; - string pump; - string prefixMsg; - RegExpTerm prev; - - PolynomialBackTrackingTerm() { - reason = getReasonString(this, pump, prefixMsg, prev) and - // there might be many reasons for this term to have polynomial backtracking - we pick the shortest one. - reason = min(string msg | msg = getReasonString(this, _, _, _) | msg order by msg.length(), msg) - } - - /** - * Holds if all non-empty successors to the polynomial backtracking term matches the end of the line. - */ - predicate isAtEndLine() { - forall(RegExpTerm succ | this.getSuccessor+() = succ and not matchesEpsilon(succ) | - succ instanceof RegExpDollar - ) - } - - /** - * Gets the string that should be repeated to cause this regular expression to perform polynomially. - */ - string getPumpString() { result = pump } - - /** - * Gets a message for which prefix a matching string must start with for this term to cause polynomial backtracking. - */ - string getPrefixMessage() { result = prefixMsg } - - /** - * Gets a predecessor to `this`, which also loops on the pump string, and thereby causes polynomial backtracking. - */ - RegExpTerm getPreviousLoop() { result = prev } - - /** - * Gets the reason for the number of match attempts. - */ - string getReason() { result = reason } -} +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +// SuperlinearBackTracking should be used directly from the shared pack, and not from this file. 
+deprecated private import codeql.regex.nfa.SuperlinearBackTracking::Make as Dep +import Dep diff --git a/python/ql/src/Security/CWE-020/OverlyLargeRange.ql b/python/ql/src/Security/CWE-020/OverlyLargeRange.ql index b4d2caf5e80..6bf7f41d8ed 100644 --- a/python/ql/src/Security/CWE-020/OverlyLargeRange.ql +++ b/python/ql/src/Security/CWE-020/OverlyLargeRange.ql @@ -12,8 +12,9 @@ * external/cwe/cwe-020 */ -import semmle.python.security.OverlyLargeRangeQuery +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.OverlyLargeRangeQuery::Make -from RegExpCharacterRange range, string reason +from TreeView::RegExpCharacterRange range, string reason where problem(range, reason) select range, "Suspicious character range that " + reason + "." diff --git a/python/ql/src/Security/CWE-116/BadTagFilter.ql b/python/ql/src/Security/CWE-116/BadTagFilter.ql index 654df65275b..afcf73f357a 100644 --- a/python/ql/src/Security/CWE-116/BadTagFilter.ql +++ b/python/ql/src/Security/CWE-116/BadTagFilter.ql @@ -14,7 +14,8 @@ * external/cwe/cwe-186 */ -import semmle.python.security.BadTagFilterQuery +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.nfa.BadTagFilterQuery::Make from HtmlMatchingRegExp regexp, string msg where msg = min(string m | isBadRegexpFilter(regexp, m) | m order by m.length(), m) // there might be multiple, we arbitrarily pick the shortest one diff --git a/python/ql/src/Security/CWE-730/PolynomialReDoS.ql b/python/ql/src/Security/CWE-730/PolynomialReDoS.ql index 1637686c1d8..1b315c651c3 100644 --- a/python/ql/src/Security/CWE-730/PolynomialReDoS.ql +++ b/python/ql/src/Security/CWE-730/PolynomialReDoS.ql @@ -14,7 +14,6 @@ */ import python -import semmle.python.security.regexp.SuperlinearBackTracking import semmle.python.security.dataflow.PolynomialReDoSQuery import DataFlow::PathGraph diff --git a/python/ql/src/Security/CWE-730/ReDoS.ql b/python/ql/src/Security/CWE-730/ReDoS.ql index 
0e66d3cdb79..4ba35c598da 100644 --- a/python/ql/src/Security/CWE-730/ReDoS.ql +++ b/python/ql/src/Security/CWE-730/ReDoS.ql @@ -14,10 +14,10 @@ * external/cwe/cwe-400 */ -import python -import semmle.python.security.regexp.ExponentialBackTracking +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.nfa.ExponentialBackTracking::Make -from RegExpTerm t, string pump, State s, string prefixMsg +from TreeView::RegExpTerm t, string pump, State s, string prefixMsg where hasReDoSResult(t, pump, s, prefixMsg) and // exclude verbose mode regexes for now diff --git a/python/ql/test/experimental/dataflow/basic/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/basic/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/basic/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/basic/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/calls/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/calls/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/calls/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/calls/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/consistency/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/consistency/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/consistency/dataflow-consistency.expected +++ 
b/python/ql/test/experimental/dataflow/consistency/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/coverage/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/coverage/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/coverage/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/coverage/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/fieldflow/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/fieldflow/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/fieldflow/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/fieldflow/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/global-flow/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/global-flow/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/global-flow/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/global-flow/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git 
a/python/ql/test/experimental/dataflow/match/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/match/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/match/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/match/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/pep_328/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/pep_328/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/pep_328/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/pep_328/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/regression/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/regression/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/regression/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/regression/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/strange-essaflow/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/strange-essaflow/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/strange-essaflow/dataflow-consistency.expected +++ 
b/python/ql/test/experimental/dataflow/strange-essaflow/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/tainttracking/basic/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/tainttracking/basic/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/tainttracking/basic/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/tainttracking/basic/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/tainttracking/commonSanitizer/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/tainttracking/commonSanitizer/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/tainttracking/commonSanitizer/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/tainttracking/commonSanitizer/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/tainttracking/customSanitizer/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/tainttracking/customSanitizer/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/tainttracking/customSanitizer/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/tainttracking/customSanitizer/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString 
parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep-py3/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/tainttracking/defaultAdditionalTaintStep/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/tainttracking/unwanted-global-flow/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/tainttracking/unwanted-global-flow/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/tainttracking/unwanted-global-flow/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/tainttracking/unwanted-global-flow/dataflow-consistency.expected @@ -6,6 +6,8 @@ 
uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/typetracking/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/typetracking/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/typetracking/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/typetracking/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/experimental/dataflow/variable-capture/dataflow-consistency.expected b/python/ql/test/experimental/dataflow/variable-capture/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/experimental/dataflow/variable-capture/dataflow-consistency.expected +++ b/python/ql/test/experimental/dataflow/variable-capture/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/library-tests/ApiGraphs/py3/dataflow-consistency.expected b/python/ql/test/library-tests/ApiGraphs/py3/dataflow-consistency.expected index 9fedaf9f663..8f4dbd04742 100644 --- a/python/ql/test/library-tests/ApiGraphs/py3/dataflow-consistency.expected +++ b/python/ql/test/library-tests/ApiGraphs/py3/dataflow-consistency.expected @@ -6,6 +6,8 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/library-tests/frameworks/django-orm/dataflow-consistency.expected 
b/python/ql/test/library-tests/frameworks/django-orm/dataflow-consistency.expected index 9fedaf9f663..06a8a168262 100644 --- a/python/ql/test/library-tests/frameworks/django-orm/dataflow-consistency.expected +++ b/python/ql/test/library-tests/frameworks/django-orm/dataflow-consistency.expected @@ -6,6 +6,56 @@ uniqueNodeToString missingToString parameterCallable localFlowIsLocal +readStepIsLocal +storeStepIsLocal +| testapp/orm_form_test.py:6:1:6:28 | [orm-model] Class MyModel | testapp/tests.py:83:16:83:36 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_form_test.py:6:1:6:28 | [orm-model] Class MyModel | testapp/tests.py:84:16:84:43 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_form_test.py:6:1:6:28 | [orm-model] Class MyModel | testapp/tests.py:85:16:85:36 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:45:15:45:20 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:29:1:29:25 | [orm-model] Class Book | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:76:15:76:20 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:29:1:29:25 | [orm-model] Class Book | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:76:15:76:20 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:33:1:33:25 | [orm-model] Class PhysicalBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:77:27:77:32 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:33:1:33:25 | [orm-model] Class PhysicalBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:78:35:78:40 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:33:1:33:25 | [orm-model] Class PhysicalBook | Store step does not preserve enclosing callable. 
| +| testapp/orm_inheritance.py:93:15:93:26 | ControlFlowNode for Str | testapp/orm_inheritance.py:29:1:29:25 | [orm-model] Class Book | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:93:15:93:26 | ControlFlowNode for Str | testapp/orm_inheritance.py:38:1:38:18 | [orm-model] Class EBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:94:23:94:28 | ControlFlowNode for Str | testapp/orm_inheritance.py:38:1:38:18 | [orm-model] Class EBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:95:35:95:40 | ControlFlowNode for Str | testapp/orm_inheritance.py:38:1:38:18 | [orm-model] Class EBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:133:15:133:20 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:117:1:117:33 | [orm-model] Class PolyBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:167:15:167:20 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:117:1:117:33 | [orm-model] Class PolyBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:167:15:167:20 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:121:1:121:33 | [orm-model] Class PolyPhysicalBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:168:27:168:32 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:121:1:121:33 | [orm-model] Class PolyPhysicalBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:169:35:169:40 | ControlFlowNode for SOURCE | testapp/orm_inheritance.py:121:1:121:33 | [orm-model] Class PolyPhysicalBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:183:15:183:26 | ControlFlowNode for Str | testapp/orm_inheritance.py:117:1:117:33 | [orm-model] Class PolyBook | Store step does not preserve enclosing callable. 
| +| testapp/orm_inheritance.py:183:15:183:26 | ControlFlowNode for Str | testapp/orm_inheritance.py:126:1:126:26 | [orm-model] Class PolyEBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:184:23:184:28 | ControlFlowNode for Str | testapp/orm_inheritance.py:126:1:126:26 | [orm-model] Class PolyEBook | Store step does not preserve enclosing callable. | +| testapp/orm_inheritance.py:185:35:185:40 | ControlFlowNode for Str | testapp/orm_inheritance.py:126:1:126:26 | [orm-model] Class PolyEBook | Store step does not preserve enclosing callable. | +| testapp/orm_security_tests.py:15:1:15:27 | [orm-model] Class Person | testapp/orm_security_tests.py:42:23:42:42 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:115:41:115:46 | ControlFlowNode for SOURCE | testapp/orm_tests.py:110:1:110:30 | [orm-model] Class TestSave5 | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:131:86:131:91 | ControlFlowNode for SOURCE | testapp/orm_tests.py:126:1:126:30 | [orm-model] Class TestSave6 | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:149:89:149:94 | ControlFlowNode for SOURCE | testapp/orm_tests.py:144:1:144:30 | [orm-model] Class TestSave7 | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:161:1:161:30 | [orm-model] Class TestSave8 | testapp/orm_tests.py:168:22:168:44 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:165:35:165:39 | ControlFlowNode for Str | testapp/orm_tests.py:161:1:161:30 | [orm-model] Class TestSave8 | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:168:58:168:63 | ControlFlowNode for SOURCE | testapp/orm_tests.py:161:1:161:30 | [orm-model] Class TestSave8 | Store step does not preserve enclosing callable. 
| +| testapp/orm_tests.py:184:41:184:45 | ControlFlowNode for Str | testapp/orm_tests.py:177:1:177:30 | [orm-model] Class TestSave9 | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:185:49:185:51 | ControlFlowNode for obj | testapp/orm_tests.py:180:1:180:44 | [orm-model] Class TestSave9WithForeignKey | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:212:55:212:59 | ControlFlowNode for Str | testapp/orm_tests.py:206:1:206:35 | [orm-model] Class save10_Comment | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:239:55:239:59 | ControlFlowNode for Str | testapp/orm_tests.py:233:1:233:35 | [orm-model] Class save11_Comment | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:273:1:273:31 | [orm-model] Class TestSave13 | testapp/orm_tests.py:281:12:281:35 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:308:12:308:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:314:12:314:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:320:11:320:32 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:320:11:320:59 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:320:11:320:78 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. 
| +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:324:12:324:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:324:12:324:60 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:324:12:324:79 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:331:12:331:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:337:12:337:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:344:12:344:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:350:12:350:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:356:12:356:33 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/orm_tests.py:294:1:294:29 | [orm-model] Class TestLoad | testapp/orm_tests.py:363:9:363:37 | ControlFlowNode for Attribute() | Store step does not preserve enclosing callable. | +| testapp/tests.py:81:33:81:37 | ControlFlowNode for Str | testapp/orm_form_test.py:6:1:6:28 | [orm-model] Class MyModel | Store step does not preserve enclosing callable. 
| compatibleTypesReflexive unreachableNodeCCtx localCallNodes diff --git a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.expected b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.expected index 0f422fdc6aa..b72ea986b93 100644 --- a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.expected +++ b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.expected @@ -1,2 +1,2 @@ | test.py:8:12:8:23 | Str | test.py:8:21:8:23 | \\s+ | Strings with many repetitions of ' ' can start matching anywhere after the start of the preceeding \\s+$ | -| test.py:9:14:9:29 | Str | test.py:9:27:9:29 | \\d+ | Strings with many repetitions of '99' can start matching anywhere after the start of the preceeding \\d+ | +| test.py:9:14:9:29 | Str | test.py:9:27:9:29 | \\d+ | Strings starting with '0.9' and with many repetitions of '99' can start matching anywhere after the start of the preceeding \\d+ | diff --git a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.ql b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.ql index 8d0aa2f8f91..19c905be1fe 100644 --- a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.ql +++ b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialBackTracking.ql @@ -1,5 +1,6 @@ import python -import semmle.python.security.regexp.SuperlinearBackTracking +private import semmle.python.RegexTreeView::RegexTreeView as TreeView +import codeql.regex.nfa.SuperlinearBackTracking::Make from PolynomialBackTrackingTerm t -select t.getRegex(), t, t.getReason() +select t.(TreeView::RegExpTerm).getRegex(), t, t.getReason() diff --git a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialReDoS.expected b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialReDoS.expected index 
424b07a0b9d..14817348cb3 100644 --- a/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialReDoS.expected +++ b/python/ql/test/query-tests/Security/CWE-730-PolynomialReDoS/PolynomialReDoS.expected @@ -16,4 +16,4 @@ nodes subpaths #select | test.py:8:30:8:33 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:8:30:8:33 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | test.py:8:21:8:23 | \\s+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value | -| test.py:9:32:9:35 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:9:32:9:35 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings with many repetitions of '99'. | test.py:9:27:9:29 | \\d+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value | +| test.py:9:32:9:35 | ControlFlowNode for text | test.py:2:26:2:32 | ControlFlowNode for ImportMember | test.py:9:32:9:35 | ControlFlowNode for text | This $@ that depends on a $@ may run slow on strings starting with '0.9' and with many repetitions of '99'. | test.py:9:27:9:29 | \\d+ | regular expression | test.py:2:26:2:32 | ControlFlowNode for ImportMember | user-provided value | diff --git a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected index bcd6dd36685..b1ed5e099cc 100644 --- a/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected +++ b/python/ql/test/query-tests/Security/CWE-730-ReDoS/ReDoS.expected @@ -2,20 +2,20 @@ | KnownCVEs.py:30:24:31:25 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ','. 
| | KnownCVEs.py:35:18:35:81 | ([-/:,#%.'"\\s!\\w]\|\\w-\\w\|'[\\s\\w]+'\\s*\|"[\\s\\w]+"\|\\([\\d,%\\.\\s]+\\))* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '"\\t"'. | | redos.py:6:28:6:42 | (?:__\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '__'. | -| redos.py:6:52:6:68 | (?:\\*\\*\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '**'. | -| redos.py:21:34:21:53 | (?:[^"\\\\]\|\\\\\\\\\|\\\\.)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\\\\\\\'. | -| redos.py:21:57:21:76 | (?:[^'\\\\]\|\\\\\\\\\|\\\\.)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\\\\\\\'. | -| redos.py:21:81:21:100 | (?:[^)\\\\]\|\\\\\\\\\|\\\\.)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\\\\\\\'. | -| redos.py:33:64:33:65 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\|\|\\n'. | +| redos.py:6:52:6:68 | (?:\\*\\*\|[\\s\\S])+? | This part of the regular expression may cause exponential backtracking on strings starting with '*' and containing many repetitions of '**'. | +| redos.py:21:34:21:53 | (?:[^"\\\\]\|\\\\\\\\\|\\\\.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\t"' and containing many repetitions of '\\\\\\\\'. | +| redos.py:21:57:21:76 | (?:[^'\\\\]\|\\\\\\\\\|\\\\.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\t'' and containing many repetitions of '\\\\\\\\'. 
| +| redos.py:21:81:21:100 | (?:[^)\\\\]\|\\\\\\\\\|\\\\.)+ | This part of the regular expression may cause exponential backtracking on strings starting with '\\t(' and containing many repetitions of '\\\\\\\\'. | +| redos.py:33:64:33:65 | .* | This part of the regular expression may cause exponential backtracking on strings starting with '!\|\\n-\|\\n' and containing many repetitions of '\|\|\\n'. | | redos.py:38:33:38:42 | (\\\\\\/\|.)*? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\\\/'. | -| redos.py:43:37:43:38 | .* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '#'. | +| redos.py:43:37:43:38 | .* | This part of the regular expression may cause exponential backtracking on strings starting with '#' and containing many repetitions of '#'. | | redos.py:49:41:49:43 | .*? | This part of the regular expression may cause exponential backtracking on strings starting with '"' and containing many repetitions of '""'. | | redos.py:49:47:49:49 | .*? | This part of the regular expression may cause exponential backtracking on strings starting with ''' and containing many repetitions of ''''. | -| redos.py:54:47:54:49 | .*? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ']['. | -| redos.py:54:80:54:82 | .*? | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ']['. | +| redos.py:54:47:54:49 | .*? | This part of the regular expression may cause exponential backtracking on strings starting with '$[' and containing many repetitions of ']['. | +| redos.py:54:80:54:82 | .*? | This part of the regular expression may cause exponential backtracking on strings starting with '$.$[' and containing many repetitions of ']['. 
| | redos.py:60:25:60:30 | [a-z]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:61:25:61:30 | [a-z]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | -| redos.py:62:53:62:64 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. | +| redos.py:62:53:62:64 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings starting with '0' and containing many repetitions of '0'. | | redos.py:63:26:63:33 | ([a-z])+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'aa'. | | redos.py:68:26:68:41 | [\\w#:.~>+()\\s-]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\t'. | | redos.py:68:48:68:50 | .*? | This part of the regular expression may cause exponential backtracking on strings starting with '[' and containing many repetitions of ']['. | @@ -51,7 +51,6 @@ | redos.py:196:91:196:92 | ,? | This part of the regular expression may cause exponential backtracking on strings starting with '{[A(A)A: ' and containing many repetitions of ',A: '. | | redos.py:199:25:199:26 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:199:28:199:29 | b+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. | -| redos.py:202:26:202:32 | (a+a?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:202:27:202:28 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. 
| | redos.py:205:25:205:26 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:211:25:211:26 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | @@ -71,16 +70,16 @@ | redos.py:268:28:268:39 | ([\ufffd\ufffd]\|[\ufffd\ufffd])* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\ufffd'. | | redos.py:271:28:271:41 | ((\ufffd\|\ufffd)\|(\ufffd\|\ufffd))* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '\ufffd'. | | redos.py:274:31:274:32 | b+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. | -| redos.py:277:48:277:50 | \\s* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '""\\t0='. | +| redos.py:277:48:277:50 | \\s* | This part of the regular expression may cause exponential backtracking on strings starting with '<0\\t0=' and containing many repetitions of '""\\t0='. | | redos.py:283:26:283:27 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:286:26:286:27 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:292:26:292:27 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:295:35:295:36 | a+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. 
| -| redos.py:301:100:301:101 | e+ | This part of the regular expression may cause exponential backtracking on strings starting with ';00000000000000' and containing many repetitions of 'e'. | +| redos.py:301:100:301:101 | e+ | This part of the regular expression may cause exponential backtracking on strings starting with '00000000000000' and containing many repetitions of 'e'. | | redos.py:304:28:304:29 | c+ | This part of the regular expression may cause exponential backtracking on strings starting with 'ab' and containing many repetitions of 'c'. | | redos.py:307:28:307:30 | \\s+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\t'. | | redos.py:310:26:310:34 | ([^/]\|X)+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'X'. | -| redos.py:313:30:313:34 | [^Y]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'Xx'. | +| redos.py:313:30:313:34 | [^Y]+ | This part of the regular expression may cause exponential backtracking on strings starting with 'x' and containing many repetitions of 'Xx'. | | redos.py:316:25:316:26 | a* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. | | redos.py:319:28:319:33 | [\\w-]* | This part of the regular expression may cause exponential backtracking on strings starting with 'foo' and containing many repetitions of '-'. | | redos.py:322:25:322:29 | (ab)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'ab'. 
| diff --git a/ql/ql/src/codeql_ql/performance/VarUnusedInDisjunctQuery.qll b/ql/ql/src/codeql_ql/performance/VarUnusedInDisjunctQuery.qll new file mode 100644 index 00000000000..503ad6db3ad --- /dev/null +++ b/ql/ql/src/codeql_ql/performance/VarUnusedInDisjunctQuery.qll @@ -0,0 +1,27 @@ +import ql + +/** + * Holds if we assume `t` is a small type, and + * variables of this type are therefore not an issue in cartesian products. + */ +predicate isSmallType(Type t) { + t.getName() = "string" // DataFlow::Configuration and the like + or + exists(NewType newType | newType = t.getDeclaration() | + forex(NewTypeBranch branch | branch = newType.getABranch() | branch.getArity() = 0) + ) + or + t.getName() = "boolean" + or + exists(NewType newType | newType = t.getDeclaration() | + forex(NewTypeBranch branch | branch = newType.getABranch() | + isSmallType(branch.getReturnType()) + ) + ) + or + exists(NewTypeBranch branch | t = branch.getReturnType() | + forall(Type param | param = branch.getParameterType(_) | isSmallType(param)) + ) + or + isSmallType(t.getASuperType()) +} diff --git a/ql/ql/src/queries/performance/VarUnusedInDisjunct.ql b/ql/ql/src/queries/performance/VarUnusedInDisjunct.ql index c26b47554fe..2d85b872153 100644 --- a/ql/ql/src/queries/performance/VarUnusedInDisjunct.ql +++ b/ql/ql/src/queries/performance/VarUnusedInDisjunct.ql @@ -10,6 +10,7 @@ */ import ql +import codeql_ql.performance.VarUnusedInDisjunctQuery /** * Holds if `node` bind `var` in a (transitive) child node. @@ -48,32 +49,6 @@ predicate alwaysBindsVar(VarDef var, AstNode node) { exists(IfFormula ifForm | ifForm = node | alwaysBindsVar(var, ifForm.getCondition())) } -/** - * Holds if we assume `t` is a small type, and - * variables of this type are therefore not an issue in cartesian products. 
- */ -predicate isSmallType(Type t) { - t.getName() = "string" // DataFlow::Configuration and the like - or - exists(NewType newType | newType = t.getDeclaration() | - forex(NewTypeBranch branch | branch = newType.getABranch() | branch.getArity() = 0) - ) - or - t.getName() = "boolean" - or - exists(NewType newType | newType = t.getDeclaration() | - forex(NewTypeBranch branch | branch = newType.getABranch() | - isSmallType(branch.getReturnType()) - ) - ) - or - exists(NewTypeBranch branch | t = branch.getReturnType() | - forall(Type param | param = branch.getParameterType(_) | isSmallType(param)) - ) - or - isSmallType(t.getASuperType()) -} - /** * Holds if `pred` is inlined. */ diff --git a/ql/ql/src/queries/style/OverrideAny.ql b/ql/ql/src/queries/style/OverrideAny.ql new file mode 100644 index 00000000000..ed3a762f0e9 --- /dev/null +++ b/ql/ql/src/queries/style/OverrideAny.ql @@ -0,0 +1,41 @@ +/** + * @name Override with unmentioned parameter + * @description A predicate that overrides the default behavior but doesn't mention a parameter is suspicious. 
+ * @kind problem + * @problem.severity warning + * @id ql/override-any + * @precision high + */ + +import ql +import codeql_ql.performance.VarUnusedInDisjunctQuery + +AstNode param(Predicate pred, string name, Type t) { + result = pred.getParameter(_) and + result.(VarDecl).getName() = name and + result.(VarDecl).getType() = t + or + result = pred.getReturnTypeExpr() and + name = "result" and + t = pred.getReturnType() +} + +predicate hasAccess(Predicate pred, string name) { + exists(param(pred, name, _).(VarDecl).getAnAccess()) + or + name = "result" and + exists(param(pred, name, _)) and + exists(ResultAccess res | res.getEnclosingPredicate() = pred) +} + +from Predicate pred, AstNode param, string name, Type paramType +where + pred.hasAnnotation("override") and + param = param(pred, name, paramType) and + not hasAccess(pred, name) and + not pred.getBody() instanceof NoneCall and + exists(pred.getBody()) and + not isSmallType(pred.getParent().(Class).getType()) and + not isSmallType(paramType) +select pred, "Override predicate doesn't mention $@. 
Maybe mention it in a 'exists(" + name + ")'?", + param, name diff --git a/ruby/ql/consistency-queries/SsaConsistency.ql b/ruby/ql/consistency-queries/SsaConsistency.ql index 54c1b149ab2..7ba9262baa4 100644 --- a/ruby/ql/consistency-queries/SsaConsistency.ql +++ b/ruby/ql/consistency-queries/SsaConsistency.ql @@ -1,18 +1,10 @@ import codeql.ruby.dataflow.SSA -import codeql.ruby.dataflow.internal.SsaImpl::Consistency as Consistency +import codeql.ruby.dataflow.internal.SsaImpl::Consistency -class MyRelevantDefinition extends Consistency::RelevantDefinition, Ssa::Definition { +class MyRelevantDefinition extends RelevantDefinition, Ssa::Definition { override predicate hasLocationInfo( string filepath, int startline, int startcolumn, int endline, int endcolumn ) { this.getLocation().hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn) } } - -query predicate nonUniqueDef = Consistency::nonUniqueDef/4; - -query predicate readWithoutDef = Consistency::readWithoutDef/3; - -query predicate deadDef = Consistency::deadDef/2; - -query predicate notDominatedByDef = Consistency::notDominatedByDef/4; diff --git a/ruby/ql/lib/change-notes/2022-11-09-actioncable-channels.md b/ruby/ql/lib/change-notes/2022-11-09-actioncable-channels.md new file mode 100644 index 00000000000..3248fc194e0 --- /dev/null +++ b/ruby/ql/lib/change-notes/2022-11-09-actioncable-channels.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Arguments to RPC endpoints (public methods) on subclasses of `ActionCable::Channel::Base` are now recognized as sources of remote user input. diff --git a/ruby/ql/lib/change-notes/2022-11-10-arel-sql.md b/ruby/ql/lib/change-notes/2022-11-10-arel-sql.md new file mode 100644 index 00000000000..e803d0e0895 --- /dev/null +++ b/ruby/ql/lib/change-notes/2022-11-10-arel-sql.md @@ -0,0 +1,5 @@ +--- +category: minorAnalysis +--- +* The `codeql.ruby.Concepts` library now has a `SqlConstruction` class, in addition to the existing `SqlExecution` class. 
+* Calls to `Arel.sql` are now modeled as instances of the new `SqlConstruction` concept. diff --git a/ruby/ql/lib/change-notes/2022-11-14-activesupport-enumerable-index-by.md b/ruby/ql/lib/change-notes/2022-11-14-activesupport-enumerable-index-by.md new file mode 100644 index 00000000000..812c292dd94 --- /dev/null +++ b/ruby/ql/lib/change-notes/2022-11-14-activesupport-enumerable-index-by.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Data flow through the `ActiveSupport` extension `Enumerable#index_by` is now modeled. diff --git a/ruby/ql/lib/codeql/ruby/Concepts.qll b/ruby/ql/lib/codeql/ruby/Concepts.qll index 2d70cc31796..54eee4e3e0a 100644 --- a/ruby/ql/lib/codeql/ruby/Concepts.qll +++ b/ruby/ql/lib/codeql/ruby/Concepts.qll @@ -11,9 +11,47 @@ private import codeql.ruby.Frameworks private import codeql.ruby.dataflow.RemoteFlowSources private import codeql.ruby.ApiGraphs +/** + * A data-flow node that constructs a SQL statement. + * + * Often, it is worthy of an alert if a SQL statement is constructed such that + * executing it would be a security risk. + * + * If it is important that the SQL statement is executed, use `SqlExecution`. + * + * Extend this class to refine existing API models. If you want to model new APIs, + * extend `SqlConstruction::Range` instead. + */ +class SqlConstruction extends DataFlow::Node instanceof SqlConstruction::Range { + /** Gets the argument that specifies the SQL statements to be constructed. */ + DataFlow::Node getSql() { result = super.getSql() } +} + +/** Provides a class for modeling new SQL execution APIs. */ +module SqlConstruction { + /** + * A data-flow node that constructs a SQL statement. + * + * Often, it is worthy of an alert if a SQL statement is constructed such that + * executing it would be a security risk. + * + * If it is important that the SQL statement is executed, use `SqlExecution`. + * + * Extend this class to model new APIs. 
If you want to refine existing API models, + * extend `SqlConstruction` instead. + */ + abstract class Range extends DataFlow::Node { + /** Gets the argument that specifies the SQL statements to be constructed. */ + abstract DataFlow::Node getSql(); + } +} + /** * A data-flow node that executes SQL statements. * + * If the context of interest is such that merely constructing a SQL statement + * would be valuable to report, consider using `SqlConstruction`. + * * Extend this class to refine existing API models. If you want to model new APIs, * extend `SqlExecution::Range` instead. */ @@ -27,6 +65,9 @@ module SqlExecution { /** * A data-flow node that executes SQL statements. * + * If the context of interest is such that merely constructing a SQL + * statement would be valuable to report, consider using `SqlConstruction`. + * * Extend this class to model new APIs. If you want to refine existing API models, * extend `SqlExecution` instead. */ diff --git a/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplConsistency.qll b/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplConsistency.qll index dde16ab5a2a..f681e90aa21 100644 --- a/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplConsistency.qll +++ b/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowImplConsistency.qll @@ -136,6 +136,18 @@ module Consistency { msg = "Local flow step does not preserve enclosing callable." } + query predicate readStepIsLocal(Node n1, Node n2, string msg) { + readStep(n1, _, n2) and + nodeGetEnclosingCallable(n1) != nodeGetEnclosingCallable(n2) and + msg = "Read step does not preserve enclosing callable." + } + + query predicate storeStepIsLocal(Node n1, Node n2, string msg) { + storeStep(n1, _, n2) and + nodeGetEnclosingCallable(n1) != nodeGetEnclosingCallable(n2) and + msg = "Store step does not preserve enclosing callable." 
+ } + private DataFlowType typeRepr() { result = getNodeType(_) } query predicate compatibleTypesReflexive(DataFlowType t, string msg) { diff --git a/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowPublic.qll b/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowPublic.qll index e04449fbeaa..bc903c64f84 100644 --- a/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowPublic.qll +++ b/ruby/ql/lib/codeql/ruby/dataflow/internal/DataFlowPublic.qll @@ -1058,6 +1058,15 @@ class MethodNode extends CallableNode { /** Gets the name of this method. */ string getMethodName() { result = this.asCallableAstNode().getName() } + + /** Holds if this method is public. */ + predicate isPublic() { this.asCallableAstNode().isPublic() } + + /** Holds if this method is private. */ + predicate isPrivate() { this.asCallableAstNode().isPrivate() } + + /** Holds if this method is protected. */ + predicate isProtected() { this.asCallableAstNode().isProtected() } } /** diff --git a/ruby/ql/lib/codeql/ruby/dataflow/internal/SsaImpl.qll b/ruby/ql/lib/codeql/ruby/dataflow/internal/SsaImpl.qll index 97b49b5f5a0..c7154b50d0f 100644 --- a/ruby/ql/lib/codeql/ruby/dataflow/internal/SsaImpl.qll +++ b/ruby/ql/lib/codeql/ruby/dataflow/internal/SsaImpl.qll @@ -30,7 +30,8 @@ private module SsaInput implements SsaImplCommon::InputSig { i = 0 or // ...or a class or module block. 
- bb.getNode(i).getNode() = scope.(ModuleBase).getAControlFlowEntryNode() + bb.getNode(i).getNode() = scope.(ModuleBase).getAControlFlowEntryNode() and + not scope instanceof Toplevel // handled by case above ) or uninitializedWrite(bb, i, v) diff --git a/ruby/ql/lib/codeql/ruby/frameworks/ActionCable.qll b/ruby/ql/lib/codeql/ruby/frameworks/ActionCable.qll index e306b7d2b99..a586abc977c 100644 --- a/ruby/ql/lib/codeql/ruby/frameworks/ActionCable.qll +++ b/ruby/ql/lib/codeql/ruby/frameworks/ActionCable.qll @@ -6,6 +6,8 @@ private import codeql.ruby.AST private import codeql.ruby.Concepts private import codeql.ruby.ApiGraphs +private import codeql.ruby.DataFlow +private import codeql.ruby.dataflow.RemoteFlowSources private import codeql.ruby.frameworks.stdlib.Logger::Logger as StdlibLogger /** @@ -26,4 +28,36 @@ module ActionCable { } } } + + private DataFlow::ConstRef getActionCableChannelBase() { + result = DataFlow::getConstant("ActionCable").getConstant("Channel").getConstant("Base") + } + + /** + * The data argument in an RPC endpoint method on a subclass of + * `ActionCable::Channel::Base`, considered as a remote flow source. + */ + class ActionCableChannelRpcParam extends RemoteFlowSource::Range { + ActionCableChannelRpcParam() { + exists(DataFlow::MethodNode m | + // Any method on a subclass of `ActionCable::Channel::Base` + // automatically becomes an RPC endpoint + m = getActionCableChannelBase().getADescendentModule().getAnInstanceMethod() and + // as long as it's not an instance method of + // `ActionCable::Channel::Base` itself, which might exist in the + // database + not m = getActionCableChannelBase().asModule().getAnInstanceMethod() and + // and as long as it's public + m.isPublic() and + // and is not called `subscribed` or `unsubscribed`. + not m.getMethodName() = ["subscribed", "unsubscribed"] + | + // If the method takes a parameter, it contains data from the remote + // request. 
+ this = m.getParameter(0) + ) + } + + override string getSourceType() { result = "ActionCable channel RPC data" } + } } diff --git a/ruby/ql/lib/codeql/ruby/frameworks/ActiveSupport.qll b/ruby/ql/lib/codeql/ruby/frameworks/ActiveSupport.qll index 47742531e30..7d3286a0a74 100644 --- a/ruby/ql/lib/codeql/ruby/frameworks/ActiveSupport.qll +++ b/ruby/ql/lib/codeql/ruby/frameworks/ActiveSupport.qll @@ -284,7 +284,17 @@ module ActiveSupport { preservesValue = true } } - // TODO: index_by, index_with, pick, pluck (they require Hash dataflow) + + private class IndexBySummary extends SimpleSummarizedCallable { + IndexBySummary() { this = "index_by" } + + override predicate propagatesFlowExt(string input, string output, boolean preservesValue) { + input = "Argument[self].Element[any]" and + output = ["Argument[block].Parameter[0]", "ReturnValue.Element[?]"] and + preservesValue = true + } + } + // TODO: index_with, pick, pluck (they require Hash dataflow) } } diff --git a/ruby/ql/lib/codeql/ruby/frameworks/Arel.qll b/ruby/ql/lib/codeql/ruby/frameworks/Arel.qll index 9fa17f4e5a5..f57fa41c740 100644 --- a/ruby/ql/lib/codeql/ruby/frameworks/Arel.qll +++ b/ruby/ql/lib/codeql/ruby/frameworks/Arel.qll @@ -6,6 +6,7 @@ private import codeql.ruby.ApiGraphs private import codeql.ruby.dataflow.FlowSummary +private import codeql.ruby.Concepts /** * Provides modeling for Arel, a low level SQL library that powers ActiveRecord. @@ -28,4 +29,14 @@ module Arel { input = "Argument[0]" and output = "ReturnValue" and preservesValue = false } } + + /** A call to `Arel.sql`, considered as a SQL construction. 
*/ + private class ArelSqlConstruction extends SqlConstruction::Range, DataFlow::CallNode { + ArelSqlConstruction() { + this = DataFlow::getConstant("Arel").getAMethodCall() and + this.getMethodName() = "sql" + } + + override DataFlow::Node getSql() { result = this.getArgument(0) } + } } diff --git a/ruby/ql/lib/codeql/ruby/security/SqlInjectionCustomizations.qll b/ruby/ql/lib/codeql/ruby/security/SqlInjectionCustomizations.qll new file mode 100644 index 00000000000..66d3b0d4afd --- /dev/null +++ b/ruby/ql/lib/codeql/ruby/security/SqlInjectionCustomizations.qll @@ -0,0 +1,55 @@ +/** + * Provides default sources, sinks and sanitizers for detecting SQL injection + * vulnerabilities, as well as extension points for adding your own. + */ + +private import codeql.ruby.Concepts +private import codeql.ruby.DataFlow +private import codeql.ruby.dataflow.BarrierGuards +private import codeql.ruby.dataflow.RemoteFlowSources + +/** + * Provides default sources, sinks and sanitizers for detecting SQL injection + * vulnerabilities, as well as extension points for adding your own. + */ +module SqlInjection { + /** A data flow source for SQL injection vulnerabilities. */ + abstract class Source extends DataFlow::Node { } + + /** A data flow sink for SQL injection vulnerabilities. */ + abstract class Sink extends DataFlow::Node { } + + /** A sanitizer for SQL injection vulnerabilities. */ + abstract class Sanitizer extends DataFlow::Node { } + + /** + * A source of remote user input, considered as a flow source. + */ + private class RemoteFlowSourceAsSource extends Source, RemoteFlowSource { } + + /** + * An SQL statement of a SQL execution, considered as a flow sink. + */ + private class SqlExecutionAsSink extends Sink { + SqlExecutionAsSink() { this = any(SqlExecution e).getSql() } + } + + /** + * An SQL statement of a SQL construction, considered as a flow sink. 
+ */ + private class SqlConstructionAsSink extends Sink { + SqlConstructionAsSink() { this = any(SqlConstruction e).getSql() } + } + + /** + * A comparison with a constant string, considered as a sanitizer-guard. + */ + private class StringConstCompareAsSanitizerGuard extends Sanitizer, StringConstCompareBarrier { } + + /** + * An inclusion check against an array of constant strings, considered as a + * sanitizer-guard. + */ + class StringConstArrayInclusionCallAsSanitizer extends Sanitizer, + StringConstArrayInclusionCallBarrier { } +} diff --git a/ruby/ql/lib/codeql/ruby/security/SqlInjectionQuery.qll b/ruby/ql/lib/codeql/ruby/security/SqlInjectionQuery.qll new file mode 100644 index 00000000000..f74e919ffe5 --- /dev/null +++ b/ruby/ql/lib/codeql/ruby/security/SqlInjectionQuery.qll @@ -0,0 +1,21 @@ +/** + * Provides default sources, sinks and sanitizers for detecting SQL injection + * vulnerabilities, as well as extension points for adding your own. + */ + +private import codeql.ruby.DataFlow +private import codeql.ruby.TaintTracking +import SqlInjectionCustomizations::SqlInjection + +/** + * A taint-tracking configuration for detecting SQL injection vulnerabilities. 
+ */ +class Configuration extends TaintTracking::Configuration { + Configuration() { this = "SqlInjectionConfiguration" } + + override predicate isSource(DataFlow::Node source) { source instanceof Source } + + override predicate isSink(DataFlow::Node source) { source instanceof Sink } + + override predicate isSanitizer(DataFlow::Node node) { node instanceof Sanitizer } +} diff --git a/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll b/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll index f49696c5bad..7bc61ee2aee 100644 --- a/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll +++ b/ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll @@ -103,7 +103,7 @@ module HeuristicNames { */ string notSensitiveRegexp() { result = - "(?is).*([^\\w$.-]|redact|censor|obfuscate|hash|md5|sha|random|((? self | parameters.rb:1:1:62:1 | self | | parameters.rb:1:14:1:14 | x | parameters.rb:1:14:1:14 | x | | parameters.rb:2:4:2:8 | ... = ... | parameters.rb:1:18:1:18 | y | @@ -87,7 +86,7 @@ definition | parameters.rb:59:20:59:20 | a | parameters.rb:59:20:59:20 | a | | parameters.rb:59:23:59:23 | b | parameters.rb:59:23:59:23 | b | | parameters.rb:59:25:59:25 | c | parameters.rb:59:25:59:25 | c | -| scopes.rb:1:1:1:15 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | +| scopes.rb:1:1:49:4 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | | scopes.rb:2:9:6:3 | self | scopes.rb:1:1:49:4 | self | | scopes.rb:4:4:4:8 | ... = ... | scopes.rb:4:4:4:4 | a | | scopes.rb:7:1:7:5 | ... = ... 
| scopes.rb:7:1:7:1 | a | @@ -166,17 +165,17 @@ definition | ssa.rb:84:10:86:8 | captured | ssa.rb:82:3:82:10 | captured | | ssa.rb:84:10:86:8 | self | ssa.rb:81:1:88:3 | self | read -| class_variables.rb:1:1:1:3 | self (class_variables.rb) | class_variables.rb:1:1:29:4 | self | class_variables.rb:3:1:3:5 | self | +| class_variables.rb:1:1:29:4 | self (class_variables.rb) | class_variables.rb:1:1:29:4 | self | class_variables.rb:3:1:3:5 | self | | class_variables.rb:5:1:7:3 | self (print) | class_variables.rb:5:1:7:3 | self | class_variables.rb:6:2:6:6 | self | | class_variables.rb:9:1:16:3 | self (X) | class_variables.rb:9:1:16:3 | self | class_variables.rb:13:7:13:10 | self | | class_variables.rb:10:3:12:5 | self (b) | class_variables.rb:10:3:12:5 | self | class_variables.rb:11:5:11:9 | self | | class_variables.rb:13:3:15:5 | self (s) | class_variables.rb:13:3:15:5 | self | class_variables.rb:14:4:14:8 | self | | class_variables.rb:26:1:29:3 | self (N) | class_variables.rb:26:1:29:3 | self | class_variables.rb:27:3:27:11 | self | | class_variables.rb:26:1:29:3 | self (N) | class_variables.rb:26:1:29:3 | self | class_variables.rb:28:3:28:7 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:1:11:9 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:6:11:9 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:27:1:29:1 | self | | instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:1:1:1:4 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:1:11:9 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 
| self | instance_variables.rb:11:6:11:9 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:27:1:29:1 | self | | instance_variables.rb:3:1:5:3 | self (foo) | instance_variables.rb:3:1:5:3 | self | instance_variables.rb:4:3:4:6 | self | | instance_variables.rb:7:1:9:3 | self (print_foo) | instance_variables.rb:7:1:9:3 | self | instance_variables.rb:8:3:8:11 | self | | instance_variables.rb:7:1:9:3 | self (print_foo) | instance_variables.rb:7:1:9:3 | self | instance_variables.rb:8:8:8:11 | self | @@ -259,7 +258,7 @@ read | parameters.rb:59:20:59:20 | a | parameters.rb:59:20:59:20 | a | parameters.rb:60:11:60:11 | a | | parameters.rb:59:23:59:23 | b | parameters.rb:59:23:59:23 | b | parameters.rb:60:16:60:16 | b | | parameters.rb:59:25:59:25 | c | parameters.rb:59:25:59:25 | c | parameters.rb:60:21:60:21 | c | -| scopes.rb:1:1:1:15 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | scopes.rb:8:1:8:6 | self | +| scopes.rb:1:1:49:4 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | scopes.rb:8:1:8:6 | self | | scopes.rb:2:9:6:3 | self | scopes.rb:1:1:49:4 | self | scopes.rb:3:4:3:9 | self | | scopes.rb:2:9:6:3 | self | scopes.rb:1:1:49:4 | self | scopes.rb:3:9:3:9 | self | | scopes.rb:2:9:6:3 | self | scopes.rb:1:1:49:4 | self | scopes.rb:5:4:5:9 | self | @@ -349,13 +348,12 @@ read | ssa.rb:84:10:86:8 | captured | ssa.rb:82:3:82:10 | captured | ssa.rb:85:15:85:22 | captured | | ssa.rb:84:10:86:8 | self | ssa.rb:81:1:88:3 | self | ssa.rb:85:10:85:22 | self | firstRead -| class_variables.rb:1:1:1:3 | self (class_variables.rb) | class_variables.rb:1:1:29:4 | self | class_variables.rb:3:1:3:5 | self | +| class_variables.rb:1:1:29:4 | self (class_variables.rb) | class_variables.rb:1:1:29:4 | self | class_variables.rb:3:1:3:5 | self | | class_variables.rb:5:1:7:3 | self (print) | class_variables.rb:5:1:7:3 | self | class_variables.rb:6:2:6:6 | self | | class_variables.rb:9:1:16:3 | self (X) | 
class_variables.rb:9:1:16:3 | self | class_variables.rb:13:7:13:10 | self | | class_variables.rb:10:3:12:5 | self (b) | class_variables.rb:10:3:12:5 | self | class_variables.rb:11:5:11:9 | self | | class_variables.rb:13:3:15:5 | self (s) | class_variables.rb:13:3:15:5 | self | class_variables.rb:14:4:14:8 | self | | class_variables.rb:26:1:29:3 | self (N) | class_variables.rb:26:1:29:3 | self | class_variables.rb:27:3:27:11 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:1:11:9 | self | | instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:1:1:1:4 | self | | instance_variables.rb:3:1:5:3 | self (foo) | instance_variables.rb:3:1:5:3 | self | instance_variables.rb:4:3:4:6 | self | | instance_variables.rb:7:1:9:3 | self (print_foo) | instance_variables.rb:7:1:9:3 | self | instance_variables.rb:8:3:8:11 | self | @@ -428,7 +426,7 @@ firstRead | parameters.rb:59:20:59:20 | a | parameters.rb:59:20:59:20 | a | parameters.rb:60:11:60:11 | a | | parameters.rb:59:23:59:23 | b | parameters.rb:59:23:59:23 | b | parameters.rb:60:16:60:16 | b | | parameters.rb:59:25:59:25 | c | parameters.rb:59:25:59:25 | c | parameters.rb:60:21:60:21 | c | -| scopes.rb:1:1:1:15 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | scopes.rb:8:1:8:6 | self | +| scopes.rb:1:1:49:4 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | scopes.rb:8:1:8:6 | self | | scopes.rb:2:9:6:3 | self | scopes.rb:1:1:49:4 | self | scopes.rb:3:4:3:9 | self | | scopes.rb:4:4:4:8 | ... = ... | scopes.rb:4:4:4:4 | a | scopes.rb:5:9:5:9 | a | | scopes.rb:7:1:7:5 | ... = ... 
| scopes.rb:7:1:7:1 | a | scopes.rb:8:6:8:6 | a | @@ -488,15 +486,13 @@ firstRead | ssa.rb:84:10:86:8 | captured | ssa.rb:82:3:82:10 | captured | ssa.rb:85:15:85:22 | captured | | ssa.rb:84:10:86:8 | self | ssa.rb:81:1:88:3 | self | ssa.rb:85:10:85:22 | self | lastRead -| class_variables.rb:1:1:1:3 | self (class_variables.rb) | class_variables.rb:1:1:29:4 | self | class_variables.rb:3:1:3:5 | self | +| class_variables.rb:1:1:29:4 | self (class_variables.rb) | class_variables.rb:1:1:29:4 | self | class_variables.rb:3:1:3:5 | self | | class_variables.rb:5:1:7:3 | self (print) | class_variables.rb:5:1:7:3 | self | class_variables.rb:6:2:6:6 | self | | class_variables.rb:9:1:16:3 | self (X) | class_variables.rb:9:1:16:3 | self | class_variables.rb:13:7:13:10 | self | | class_variables.rb:10:3:12:5 | self (b) | class_variables.rb:10:3:12:5 | self | class_variables.rb:11:5:11:9 | self | | class_variables.rb:13:3:15:5 | self (s) | class_variables.rb:13:3:15:5 | self | class_variables.rb:14:4:14:8 | self | | class_variables.rb:26:1:29:3 | self (N) | class_variables.rb:26:1:29:3 | self | class_variables.rb:28:3:28:7 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:1:1:1:4 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:27:1:29:1 | self | -| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:1:1:1:4 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:27:1:29:1 | self | | instance_variables.rb:3:1:5:3 | self (foo) | instance_variables.rb:3:1:5:3 | self | instance_variables.rb:4:3:4:6 | self | | instance_variables.rb:7:1:9:3 | self (print_foo) | instance_variables.rb:7:1:9:3 | self | instance_variables.rb:8:8:8:11 | self | | 
instance_variables.rb:13:1:18:3 | self (X) | instance_variables.rb:13:1:18:3 | self | instance_variables.rb:14:3:14:4 | self | @@ -568,7 +564,7 @@ lastRead | parameters.rb:59:20:59:20 | a | parameters.rb:59:20:59:20 | a | parameters.rb:60:11:60:11 | a | | parameters.rb:59:23:59:23 | b | parameters.rb:59:23:59:23 | b | parameters.rb:60:16:60:16 | b | | parameters.rb:59:25:59:25 | c | parameters.rb:59:25:59:25 | c | parameters.rb:60:21:60:21 | c | -| scopes.rb:1:1:1:15 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | scopes.rb:8:1:8:6 | self | +| scopes.rb:1:1:49:4 | self (scopes.rb) | scopes.rb:1:1:49:4 | self | scopes.rb:8:1:8:6 | self | | scopes.rb:2:9:6:3 | self | scopes.rb:1:1:49:4 | self | scopes.rb:5:4:5:9 | self | | scopes.rb:4:4:4:8 | ... = ... | scopes.rb:4:4:4:4 | a | scopes.rb:5:9:5:9 | a | | scopes.rb:7:1:7:5 | ... = ... | scopes.rb:7:1:7:1 | a | scopes.rb:8:6:8:6 | a | @@ -630,9 +626,9 @@ lastRead | ssa.rb:84:10:86:8 | self | ssa.rb:81:1:88:3 | self | ssa.rb:85:10:85:22 | self | adjacentReads | class_variables.rb:26:1:29:3 | self (N) | class_variables.rb:26:1:29:3 | self | class_variables.rb:27:3:27:11 | self | class_variables.rb:28:3:28:7 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:1:1:1:4 | self | instance_variables.rb:11:1:11:9 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:1:11:9 | self | instance_variables.rb:11:6:11:9 | self | -| instance_variables.rb:1:1:1:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:6:11:9 | self | instance_variables.rb:27:1:29:1 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:1:1:1:4 | self | instance_variables.rb:11:1:11:9 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | 
instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:1:11:9 | self | instance_variables.rb:11:6:11:9 | self | +| instance_variables.rb:1:1:44:4 | self (instance_variables.rb) | instance_variables.rb:1:1:44:4 | self | instance_variables.rb:11:6:11:9 | self | instance_variables.rb:27:1:29:1 | self | | instance_variables.rb:7:1:9:3 | self (print_foo) | instance_variables.rb:7:1:9:3 | self | instance_variables.rb:8:3:8:11 | self | instance_variables.rb:8:8:8:11 | self | | instance_variables.rb:37:3:43:5 | self (x) | instance_variables.rb:37:3:43:5 | self | instance_variables.rb:41:4:41:4 | self | instance_variables.rb:42:4:42:7 | self | | instance_variables.rb:37:3:43:5 | self (x) | instance_variables.rb:37:3:43:5 | self | instance_variables.rb:42:4:42:7 | self | instance_variables.rb:42:6:42:7 | self | diff --git a/ruby/ql/test/query-tests/security/cwe-089/ArelInjection.rb b/ruby/ql/test/query-tests/security/cwe-089/ArelInjection.rb new file mode 100644 index 00000000000..a1efb3adabb --- /dev/null +++ b/ruby/ql/test/query-tests/security/cwe-089/ArelInjection.rb @@ -0,0 +1,8 @@ + +class PotatoController < ActionController::Base + def unsafe_action + name = params[:user_name] + # BAD: SQL statement constructed from user input + sql = Arel.sql("SELECT * FROM users WHERE name = #{name}") + end +end \ No newline at end of file diff --git a/ruby/ql/test/query-tests/security/cwe-089/SqlInjection.expected b/ruby/ql/test/query-tests/security/cwe-089/SqlInjection.expected index d78b2675f25..d664bad4293 100644 --- a/ruby/ql/test/query-tests/security/cwe-089/SqlInjection.expected +++ b/ruby/ql/test/query-tests/security/cwe-089/SqlInjection.expected @@ -33,6 +33,8 @@ edges | ActiveRecordInjection.rb:137:21:137:44 | ...[...] : | ActiveRecordInjection.rb:20:22:20:30 | condition : | | ActiveRecordInjection.rb:151:59:151:64 | call to params : | ActiveRecordInjection.rb:151:59:151:74 | ...[...] : | | ActiveRecordInjection.rb:151:59:151:74 | ...[...] 
: | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | +| ArelInjection.rb:4:12:4:17 | call to params : | ArelInjection.rb:4:12:4:29 | ...[...] : | +| ArelInjection.rb:4:12:4:29 | ...[...] : | ArelInjection.rb:6:20:6:61 | "SELECT * FROM users WHERE nam..." | nodes | ActiveRecordInjection.rb:8:25:8:28 | name : | semmle.label | name : | | ActiveRecordInjection.rb:8:31:8:34 | pass : | semmle.label | pass : | @@ -85,6 +87,9 @@ nodes | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | semmle.label | "this is an unsafe annotation:..." | | ActiveRecordInjection.rb:151:59:151:64 | call to params : | semmle.label | call to params : | | ActiveRecordInjection.rb:151:59:151:74 | ...[...] : | semmle.label | ...[...] : | +| ArelInjection.rb:4:12:4:17 | call to params : | semmle.label | call to params : | +| ArelInjection.rb:4:12:4:29 | ...[...] : | semmle.label | ...[...] : | +| ArelInjection.rb:6:20:6:61 | "SELECT * FROM users WHERE nam..." | semmle.label | "SELECT * FROM users WHERE nam..." | subpaths #select | ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | ActiveRecordInjection.rb:70:23:70:28 | call to params : | ActiveRecordInjection.rb:10:33:10:67 | "name='#{...}' and pass='#{...}'" | This SQL query depends on a $@. | ActiveRecordInjection.rb:70:23:70:28 | call to params | user-provided value | @@ -105,3 +110,4 @@ subpaths | ActiveRecordInjection.rb:92:21:92:35 | ...[...] | ActiveRecordInjection.rb:92:21:92:26 | call to params : | ActiveRecordInjection.rb:92:21:92:35 | ...[...] | This SQL query depends on a $@. | ActiveRecordInjection.rb:92:21:92:26 | call to params | user-provided value | | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | ActiveRecordInjection.rb:98:10:98:15 | call to params : | ActiveRecordInjection.rb:104:20:104:32 | ... + ... | This SQL query depends on a $@. 
| ActiveRecordInjection.rb:98:10:98:15 | call to params | user-provided value | | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | ActiveRecordInjection.rb:151:59:151:64 | call to params : | ActiveRecordInjection.rb:151:27:151:76 | "this is an unsafe annotation:..." | This SQL query depends on a $@. | ActiveRecordInjection.rb:151:59:151:64 | call to params | user-provided value | +| ArelInjection.rb:6:20:6:61 | "SELECT * FROM users WHERE nam..." | ArelInjection.rb:4:12:4:17 | call to params : | ArelInjection.rb:6:20:6:61 | "SELECT * FROM users WHERE nam..." | This SQL query depends on a $@. | ArelInjection.rb:4:12:4:17 | call to params | user-provided value | diff --git a/shared/regex/change-notes/2022-09-26-initial-version.md b/shared/regex/change-notes/2022-09-26-initial-version.md new file mode 100644 index 00000000000..e4d6e0490c2 --- /dev/null +++ b/shared/regex/change-notes/2022-09-26-initial-version.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Initial release. Extracted common regex related code, including the ReDoS analysis, into a library pack to share code between languages. diff --git a/shared/regex/codeql-pack.lock.yml b/shared/regex/codeql-pack.lock.yml new file mode 100644 index 00000000000..a046f6d9786 --- /dev/null +++ b/shared/regex/codeql-pack.lock.yml @@ -0,0 +1,4 @@ +--- +dependencies: {} +compiled: false +lockVersion: 1.0.0 \ No newline at end of file diff --git a/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll new file mode 100644 index 00000000000..8d3a0b9c0ff --- /dev/null +++ b/shared/regex/codeql/regex/OverlyLargeRangeQuery.qll @@ -0,0 +1,300 @@ +/** + * Classes and predicates for working with suspicious character ranges. + */ + +private import RegexTreeView + +/** + * Classes and predicates implementing an analysis detecting suspicious character ranges. 
+ */ +module Make { + private import TreeImpl + + /** + * Gets a rank for `range` that is unique for ranges in the same file. + * Prioritizes ranges that match more characters. + */ + int rankRange(RegExpCharacterRange range) { + range = + rank[result](RegExpCharacterRange r, int startline, int startcolumn, int low, int high | + r.hasLocationInfo(_, startline, startcolumn, _, _) and + isRange(r, low, high) + | + r order by (high - low) desc, startline, startcolumn + ) + } + + /** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). */ + predicate isRange(RegExpCharacterRange range, int low, int high) { + exists(string lowc, string highc | + range.isRange(lowc, highc) and + low.toUnicode() = lowc and + high.toUnicode() = highc + ) + } + + /** Holds if `char` is an alpha-numeric character. */ + predicate isAlphanumeric(string char) { + // written like this to avoid having a bindingset for the predicate + char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z + } + + /** + * Holds if the given ranges are from the same character class + * and there exists at least one character matched by both ranges. + */ + predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { + exists(RegExpCharacterClass clz | + a = clz.getAChild() and + b = clz.getAChild() and + a != b + | + exists(int alow, int ahigh, int blow, int bhigh | + isRange(a, alow, ahigh) and + isRange(b, blow, bhigh) and + alow <= bhigh and + blow <= ahigh + ) + ) + } + + /** + * Holds if `range` overlaps with the char class `escape` from the same character class. 
+ */ + predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { + exists(RegExpCharacterClass clz, string low, string high | + range = clz.getAChild() and + escape = clz.getAChild() and + range.isRange(low, high) + | + escape.getValue() = "w" and + getInRange(low, high).regexpMatch("\\w") + or + escape.getValue() = "d" and + getInRange(low, high).regexpMatch("\\d") + or + escape.getValue() = "s" and + getInRange(low, high).regexpMatch("\\s") + ) + } + + /** Gets the unicode code point for a `char`. */ + bindingset[char] + int toCodePoint(string char) { result.toUnicode() = char } + + /** A character range that appears to be overly wide. */ + class OverlyWideRange instanceof RegExpCharacterRange { + OverlyWideRange() { + exists(int low, int high, int numChars | + isRange(this, low, high) and + numChars = (1 + high - low) and + this.getRootTerm().isUsedAsRegExp() and + numChars >= 10 + | + // across the Z-a range (which includes backticks) + toCodePoint("Z") >= low and + toCodePoint("a") <= high + or + // across the 9-A range (which includes e.g. ; and ?) + toCodePoint("9") >= low and + toCodePoint("A") <= high + or + // a non-alphanumeric char as part of the range boundaries + exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) and + // while still being ascii + low < 128 and + high < 128 + ) and + // allowlist for known ranges + not this = allowedWideRanges() + } + + /** Gets a string representation of a character class that matches the same chars as this range. */ + string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } + + /** Gets a string representation of this range. */ + string toString() { result = super.toString() } + + /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */ + predicate isRange(string lo, string hi) { super.isRange(lo, hi) } + } + + /** Gets a range that should not be reported as an overly wide range. 
*/ + RegExpCharacterRange allowedWideRanges() { + // ~ is the last printable ASCII character, it's used right in various wide ranges. + result.isRange(_, "~") + or + // the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character. + result.isRange([" ", "!"], _) + or + // the `[@-_]` range is intentional + result.isRange("@", "_") + or + // starting from the zero byte is a good indication that it's purposely matching a large range. + result.isRange(0.toUnicode(), _) + } + + /** Gets a char between (and including) `low` and `high`. */ + bindingset[low, high] + private string getInRange(string low, string high) { + result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() + } + + /** A module computing an equivalent character class for an overly wide range. */ + module RangePrinter { + bindingset[char] + bindingset[result] + private string next(string char) { + exists(int prev, int next | + prev.toUnicode() = char and + next.toUnicode() = result and + next = prev + 1 + ) + } + + /** Gets the points where the parts of the pretty printed range should be cut off. */ + private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } + + /** Gets the char to use in the low end of a range for a given `cut` */ + private string lowCut(string cut) { + cut = ["A", "a", "0"] and + result = cut + or + cut = ["Z", "z", "9"] and + result = next(cut) + } + + /** Gets the char to use in the high end of a range for a given `cut` */ + private string highCut(string cut) { + cut = ["Z", "z", "9"] and + result = cut + or + cut = ["A", "a", "0"] and + next(result) = cut + } + + /** Gets the cutoff char used for a given `part` of a range when pretty-printing it. 
*/ + private string cutoff(OverlyWideRange range, int part) { + exists(int low, int high | isRange(range, low, high) | + result = + rank[part + 1](string cut | + cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high + | + cut order by toCodePoint(cut) + ) + ) + } + + /** Gets the number of parts we should print for a given `range`. */ + private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) } + + /** Holds if the given part of a range should span from `low` to `high`. */ + private predicate part(OverlyWideRange range, int part, string low, string high) { + // first part. + part = 0 and + ( + range.isRange(low, high) and + parts(range) = 1 + or + parts(range) >= 2 and + range.isRange(low, _) and + high = highCut(cutoff(range, part)) + ) + or + // middle + part >= 1 and + part < parts(range) - 1 and + low = lowCut(cutoff(range, part - 1)) and + high = highCut(cutoff(range, part)) + or + // last. + part = parts(range) - 1 and + low = lowCut(cutoff(range, part - 1)) and + range.isRange(_, high) + } + + /** Gets an escaped `char` for use in a character class. */ + bindingset[char] + private string escape(string char) { + exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | + if char.regexpMatch(reg) then result = "\\" + char else result = char + ) + } + + /** Gets a part of the equivalent range. */ + private string printEquivalentCharClass(OverlyWideRange range, int part) { + exists(string low, string high | part(range, part, low, high) | + if + isAlphanumeric(low) and + isAlphanumeric(high) + then result = low + "-" + high + else + result = + strictconcat(string char | char = getInRange(low, high) | escape(char) order by char) + ) + } + + /** Gets the entire pretty printed equivalent range. 
*/ + string printEquivalentCharClass(OverlyWideRange range) { + result = + strictconcat(string r, int part | + r = "[" and part = -1 and exists(range) + or + r = printEquivalentCharClass(range, part) + or + r = "]" and part = parts(range) + | + r order by part + ) + } + } + + /** Gets a char range that is overly large because of `reason`. */ + RegExpCharacterRange getABadRange(string reason, int priority) { + result instanceof OverlyWideRange and + priority = 0 and + exists(string equiv | equiv = result.(OverlyWideRange).printEquivalent() | + if equiv.length() <= 50 + then reason = "is equivalent to " + equiv + else reason = "is equivalent to " + equiv.substring(0, 50) + "..." + ) + or + priority = 1 and + exists(RegExpCharacterRange other | + reason = "overlaps with " + other + " in the same character class" and + rankRange(result) < rankRange(other) and + overlap(result, other) + ) + or + priority = 2 and + exists(RegExpCharacterClassEscape escape | + reason = "overlaps with " + escape + " in the same character class" and + overlapsWithCharEscape(result, escape) + ) + or + reason = "is empty" and + priority = 3 and + exists(int low, int high | + isRange(result, low, high) and + low > high + ) + } + + /** Holds if `range` matches suspiciously many characters. */ + predicate problem(RegExpCharacterRange range, string reason) { + reason = + strictconcat(string m, int priority | + range = getABadRange(m, priority) + | + m, ", and " order by priority desc + ) and + // specifying a range using an escape is usually OK. + not range.getAChild() instanceof RegExpEscape and + // Unicode escapes in strings are interpreted before it turns into a regexp, + // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. + // We therefore exclude these ranges. 
+ range.getRootTerm().getParent() instanceof RegExpLiteral and + // is used as regexp (mostly for JS where regular expressions are parsed eagerly) + range.getRootTerm().isUsedAsRegExp() + } +} diff --git a/shared/regex/codeql/regex/RegexTreeView.qll b/shared/regex/codeql/regex/RegexTreeView.qll new file mode 100644 index 00000000000..f805bd83185 --- /dev/null +++ b/shared/regex/codeql/regex/RegexTreeView.qll @@ -0,0 +1,451 @@ +/** + * This file contains a `RegexTreeViewSig` module describing the syntax tree of regular expressions. + */ + +/** + * A signature describing the syntax tree of regular expressions. + */ +signature module RegexTreeViewSig { + /** + * An element used in some way as or in a regular expression. + * This class exists to have a common supertype that all languages can agree on. + */ + class Top; + + /** + * An element containing a regular expression term, that is, either + * a string literal (parsed as a regular expression; the root of the parse tree) + * or another regular expression term (a descendant of the root). + */ + class RegExpParent extends Top; + + /** + * A regular expression literal. + * + * Note that this class does not cover regular expressions constructed by calling the built-in + * `RegExp` function. + * + * Example: + * + * ``` + * /(?i)ab*c(d|e)$/ + * ``` + */ + class RegExpLiteral extends RegExpParent; + + /** + * A regular expression term, that is, a syntactic part of a regular expression. + * These are the tree nodes that form the parse tree of a regular expression literal. + */ + class RegExpTerm extends Top { + /** Gets a child term of this term. */ + RegExpTerm getAChild(); + + /** + * Holds if this is the root term of a regular expression. + */ + predicate isRootTerm(); + + /** + * Gets the parent term of this regular expression term, or the + * regular expression literal if this is the root term. 
+ */ + RegExpParent getParent(); + + /** + * Holds if this term is part of a regular expression literal, or a string literal + * that is interpreted as a regular expression. + */ + predicate isUsedAsRegExp(); + + /** Gets the outermost term of this regular expression. */ + RegExpTerm getRootTerm(); + + /** Gets the raw source text of this term. */ + string getRawValue(); + + /** Gets the `i`th child term of this term. */ + RegExpTerm getChild(int i); + + /** Gets the number of child terms of this term. */ + int getNumChild(); + + /** Gets the regular expression term that is matched (textually) after this one, if any. */ + RegExpTerm getSuccessor(); + + string toString(); + + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ); + } + + /** + * A quantified regular expression term. + * + * Example: + * + * ``` + * ((ECMA|Java)[sS]cript)* + * ``` + */ + class RegExpQuantifier extends RegExpTerm; + + /** + * A star-quantified term. + * + * Example: + * + * ``` + * \w* + * ``` + */ + class RegExpStar extends RegExpQuantifier; + + /** + * An optional term. + * + * Example: + * + * ``` + * ;? + * ``` + */ + class RegExpOpt extends RegExpQuantifier; + + /** + * A plus-quantified term. + * + * Example: + * + * ``` + * \w+ + * ``` + */ + class RegExpPlus extends RegExpQuantifier; + + /** + * A range-quantified term + * + * Examples: + * + * ``` + * \w{2,4} + * \w{2,} + * \w{2} + * ``` + */ + class RegExpRange extends RegExpQuantifier { + /** Gets the lower bound of the range. */ + int getLowerBound(); + + /** + * Gets the upper bound of the range, if any. + * + * If there is no upper bound, any number of repetitions is allowed. + * For a term of the form `r{lo}`, both the lower and the upper bound + * are `lo`. + */ + int getUpperBound(); + } + + /** + * An escaped regular expression term, that is, a regular expression + * term starting with a backslash. + * + * Example: + * + * ``` + * \. 
+ * \w + * ``` + */ + class RegExpEscape extends RegExpTerm; + + /** + * A character class escape in a regular expression. + * + * Examples: + * + * ``` + * \w + * \S + * ``` + */ + class RegExpCharacterClassEscape extends RegExpEscape { + /** Gets the name of the character class; for example, `w` for `\w`. */ + string getValue(); + } + + /** + * An alternative term, that is, a term of the form `a|b`. + * + * Example: + * + * ``` + * ECMA|Java + * ``` + */ + class RegExpAlt extends RegExpTerm; + + /** + * A grouped regular expression. + * + * Examples: + * + * ``` + * (ECMA|Java) + * (?:ECMA|Java) + * (? ['"]) + * ``` + */ + class RegExpGroup extends RegExpTerm { + /** + * Gets the index of this capture group within the enclosing regular + * expression literal. + * + * For example, in the regular expression `/((a?).)(?:b)/`, the + * group `((a?).)` has index 1, the group `(a?)` nested inside it + * has index 2, and the group `(?:b)` has no index, since it is + * not a capture group. + */ + int getNumber(); + } + + /** + * A back reference, that is, a term of the form `\i` or `\k` + * in a regular expression. + * + * Examples: + * + * ``` + * \1 + * \k + * ``` + */ + class RegExpBackRef extends RegExpTerm { + /** Gets the capture group this back reference refers to. */ + RegExpGroup getGroup(); + } + + /** + * A sequence term. + * + * Example: + * + * ``` + * (ECMA|Java)Script + * ``` + * + * This is a sequence with the elements `(ECMA|Java)` and `Script`. + */ + class RegExpSequence extends RegExpTerm; + + /** + * A zero-width lookahead or lookbehind assertion. + * + * Examples: + * + * ``` + * (?=\w) + * (?!\n) + * (?<=\.) + * (?&] + * ``` + */ + class RegExpCharacterClass extends RegExpTerm { + /** + * Holds if this character class matches any character. + */ + predicate isUniversalClass(); + + /** Holds if this is an inverted character class, that is, a term of the form `[^...]`. 
*/ + predicate isInverted(); + } + + /** + * A character range in a character class in a regular expression. + * + * Example: + * + * ``` + * a-z + * ``` + */ + class RegExpCharacterRange extends RegExpTerm { + /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */ + predicate isRange(string lo, string hi); + } + + /** + * A dot regular expression. + * + * Example: + * + * ``` + * . + * ``` + */ + class RegExpDot extends RegExpTerm; + + /** + * A dollar assertion `$` matching the end of a line. + * + * Example: + * + * ``` + * $ + * ``` + */ + class RegExpDollar extends RegExpTerm; + + /** + * A caret assertion `^` matching the beginning of a line. + * + * Example: + * + * ``` + * ^ + * ``` + */ + class RegExpCaret extends RegExpTerm; + + /** + * A word boundary assertion. + * + * Example: + * + * ``` + * \b + * ``` + */ + class RegExpWordBoundary extends RegExpTerm; + + /** + * A regular expression term that permits unlimited repetitions. + */ + class InfiniteRepetitionQuantifier extends RegExpQuantifier; + + /** + * Holds if the regular expression should not be considered. + * + * For javascript we make the pragmatic performance optimization to ignore minified files. + */ + predicate isExcluded(RegExpParent parent); + + /** + * Holds if `term` is a possessive quantifier. + * As javascript's regexes do not support possessive quantifiers, this never holds, but is used by the shared library. + */ + predicate isPossessive(RegExpQuantifier term); + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against. + * Not yet implemented for JavaScript. + */ + predicate matchesAnyPrefix(RegExpTerm term); + + /** + * Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against. + * Not yet implemented for JavaScript. 
+ */
+ predicate matchesAnySuffix(RegExpTerm term);
+
+ /**
+ * Holds if `term` is an escape class representing e.g. `\d`.
+ * `clazz` is which character class it represents, e.g. "d" for `\d`.
+ */
+ predicate isEscapeClass(RegExpTerm term, string clazz);
+
+ /**
+ * Holds if `root` has the `i` flag for case-insensitive matching.
+ */
+ predicate isIgnoreCase(RegExpTerm root);
+
+ /**
+ * Holds if `root` has the `s` ("dot-all") flag, which makes `.` also match line terminators.
+ */
+ predicate isDotAll(RegExpTerm root);
+}
diff --git a/shared/regex/codeql/regex/nfa/BadTagFilterQuery.qll b/shared/regex/codeql/regex/nfa/BadTagFilterQuery.qll
new file mode 100644
index 00000000000..c9c254fe990
--- /dev/null
+++ b/shared/regex/codeql/regex/nfa/BadTagFilterQuery.qll
@@ -0,0 +1,177 @@
+/**
+ * Provides predicates for reasoning about bad tag filter vulnerabilities.
+ */
+
+private import NfaUtils as NfaUtils
+private import RegexpMatching as RM
+private import codeql.regex.RegexTreeView
+
+/**
+ * Module implementing classes and predicates reasoning about bad tag filter vulnerabilities.
+ */
+module Make{
+ private import TreeImpl
+ import RM::Make
+
+ /**
+ * Holds if the regexp `root` should be tested against `str`.
+ * Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`.
+ * `ignorePrefix` toggles whether the regular expression should be treated as accepting any prefix if it's unanchored.
+ * `testWithGroups` toggles whether it's tested which groups are filled by a given input string.
+ */
+ private predicate isBadTagFilterCandidate(
+ RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
+ ) {
+ // the regexp must mention "<" and ">" explicitly. 
+ forall(string angleBracket | angleBracket = ["<", ">"] | + any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() = + root + ) and + ignorePrefix = true and + ( + str = ["", "", "", " ", "", + "", "", " ", " ", + " ", "", + "", "", "", + "", "", + "", "", + "" + ] and + testWithGroups = false + ) + } + + /** + * A regexp that matches some string from the `isBadTagFilterCandidate` predicate. + */ + class HtmlMatchingRegExp instanceof RootTerm { + HtmlMatchingRegExp() { RegexpMatching ::matches(this, _) } + + /** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */ + predicate matches(string str) { RegexpMatching ::matches(this, str) } + + /** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */ + predicate fillsCaptureGroup(string str, int g) { + RegexpMatching ::fillsCaptureGroup(this, str, g) + } + + /** Gets a string representation of this term. */ + string toString() { result = super.toString() } + + /** Holds if this term has the specified location. */ + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ) { + super.hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn) + } + } + + /** + * Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match. + * + * When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`. + */ + predicate isBadRegexpFilter(HtmlMatchingRegExp regexp, string msg) { + // CVE-2021-33829 - matching both "" and "", but in different capture groups + regexp.matches("") and + regexp.matches("") and + exists(int a, int b | a != b | + regexp.fillsCaptureGroup("", a) and + // might be ambiguously parsed (matching both capture groups), and that is ok here. 
+ regexp.fillsCaptureGroup("", b) and + not regexp.fillsCaptureGroup("", a) and + msg = + "Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group " + + a + " and comments ending with --!> are matched with capture group " + + strictconcat(int i | regexp.fillsCaptureGroup("", i) | i.toString(), ", ") + + "." + ) + or + // CVE-2020-17480 - matching "" and other tags, but not "". + exists(int group, int other | + group != other and + regexp.fillsCaptureGroup("", group) and + regexp.fillsCaptureGroup(" ", other) and + not regexp.matches("") and + not regexp.fillsCaptureGroup("", any(int i | i != group)) and + not regexp.fillsCaptureGroup("", group) and + not regexp.fillsCaptureGroup(" ", group) and + not regexp.fillsCaptureGroup("") and + regexp.matches("") and + not regexp.matches(" ") and + ( + not regexp.matches("") and + msg = "This regular expression matches , but not " + or + not regexp.matches("") and + msg = "This regular expression matches , but not " + ) + or + regexp.matches("") and + regexp.matches("") and + not regexp.matches("") and + not regexp.matches(" ") and + msg = + "This regular expression does not match script tags where the attribute uses single-quotes." + or + regexp.matches("") and + regexp.matches("") and + not regexp.matches("") and + not regexp.matches(" ") and + msg = + "This regular expression does not match script tags where the attribute uses double-quotes." + or + regexp.matches("") and + regexp.matches("") and + not regexp.matches("") and + not regexp.matches(" ") and + not regexp.matches(" ") and + msg = + "This regular expression does not match script tags where tabs are used between attributes." 
+ or + regexp.matches("") and + not isIgnoreCase(regexp) and + not regexp.matches(" ") and + not regexp.matches(" ") and + ( + not regexp.matches("") and + msg = "This regular expression does not match upper case ") and + regexp.matches("") and + msg = "This regular expression does not match mixed case ") and + not regexp.matches(" ") and + not regexp.matches(" ") and + ( + not regexp.matches("") and + msg = "This regular expression does not match script end tags like ." + or + not regexp.matches("") and + msg = "This regular expression does not match script end tags like ." + or + not regexp.matches("") and + msg = "This regular expression does not match script end tags like ." + ) + } +} diff --git a/shared/regex/codeql/regex/nfa/ExponentialBackTracking.qll b/shared/regex/codeql/regex/nfa/ExponentialBackTracking.qll new file mode 100644 index 00000000000..a2a35298b37 --- /dev/null +++ b/shared/regex/codeql/regex/nfa/ExponentialBackTracking.qll @@ -0,0 +1,355 @@ +/** + * This library implements the analysis described in the following two papers: + * + * James Kirrage, Asiri Rathnayake, Hayo Thielecke: Static Analysis for + * Regular Expression Denial-of-Service Attacks. NSS 2013. + * (http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf) + * Asiri Rathnayake, Hayo Thielecke: Static Analysis for Regular Expression + * Exponential Runtime via Substructural Logics. 2014. + * (https://www.cs.bham.ac.uk/~hxt/research/redos_full.pdf) + * + * The basic idea is to search for overlapping cycles in the NFA, that is, + * states `q` such that there are two distinct paths from `q` to itself + * that consume the same word `w`. + * + * For any such state `q`, an attack string can be constructed as follows: + * concatenate a prefix `v` that takes the NFA to `q` with `n` copies of + * the word `w` that leads back to `q` along two different paths, followed + * by a suffix `x` that is _not_ accepted in state `q`. 
A backtracking + * implementation will need to explore at least 2^n different ways of going + * from `q` back to itself while trying to match the `n` copies of `w` + * before finally giving up. + * + * Now in order to identify overlapping cycles, all we have to do is find + * pumpable forks, that is, states `q` that can transition to two different + * states `r1` and `r2` on the same input symbol `c`, such that there are + * paths from both `r1` and `r2` to `q` that consume the same word. The latter + * condition is equivalent to saying that `(q, q)` is reachable from `(r1, r2)` + * in the product NFA. + * + * This is what the library does. It makes a simple attempt to construct a + * prefix `v` leading into `q`, but only to improve the alert message. + * And the library tries to prove the existence of a suffix that ensures + * rejection. This check might fail, which can cause false positives. + * + * Finally, sometimes it depends on the translation whether the NFA generated + * for a regular expression has a pumpable fork or not. We implement one + * particular translation, which may result in false positives or negatives + * relative to some particular JavaScript engine. + * + * More precisely, the library constructs an NFA from a regular expression `r` + * as follows: + * + * * Every sub-term `t` gives rise to an NFA state `Match(t,i)`, representing + * the state of the automaton before attempting to match the `i`th character in `t`. + * * There is one accepting state `Accept(r)`. + * * There is a special `AcceptAnySuffix(r)` state, which accepts any suffix string + * by using an epsilon transition to `Accept(r)` and an any transition to itself. + * * Transitions between states may be labelled with epsilon, or an abstract + * input symbol. + * * Each abstract input symbol represents a set of concrete input characters: + * either a single character, a set of characters represented by a + * character class, or the set of all characters. 
+ * The product automaton is constructed lazily, starting with pair states
+ * `(q, q)` where `q` is a fork, and proceeding along an over-approximate
+ * step relation.
+ * * The over-approximate step relation allows transitions along pairs of
+ * abstract input symbols where the symbols have overlap in the characters they accept.
+ * * Once a trace of pairs of abstract input symbols that leads from a fork
+ * back to itself has been identified, we attempt to construct a concrete
+ * string corresponding to it, which may fail.
+ * * Lastly we ensure that any state reached by repeating `n` copies of `w` has
+ * a suffix `x` (possibly empty) that is most likely __not__ accepted.
+ */
+
+private import NfaUtils as NfaUtils
+private import codeql.regex.RegexTreeView
+
+/**
+ * A parameterized module implementing the analysis described in the above papers.
+ */
+module Make {
+ private import TreeImpl
+ import NfaUtils::Make
+
+ /**
+ * Holds if state `s` might be inside a backtracking repetition.
+ */
+ pragma[noinline]
+ private predicate stateInsideBacktracking(State s) {
+ s.getRepr().getParent*() instanceof MaybeBacktrackingRepetition
+ }
+
+ /**
+ * An infinitely repeating quantifier that might backtrack.
+ */
+ private class MaybeBacktrackingRepetition instanceof InfiniteRepetitionQuantifier {
+ MaybeBacktrackingRepetition() {
+ exists(RegExpTerm child |
+ child instanceof RegExpAlt or
+ child instanceof RegExpQuantifier
+ |
+ child.getParent+() = this
+ )
+ }
+
+ string toString() { result = this.(InfiniteRepetitionQuantifier).toString() }
+ }
+
+ /**
+ * A state in the product automaton.
+ */
+ private newtype TStatePair =
+ /**
+ * We lazily only construct those states that we are actually
+ * going to need: `(q, q)` for every fork state `q`, and any
+ * pair of states that can be reached from a pair that we have
+ * already constructed. 
To cut down on the number of states, + * we only represent states `(q1, q2)` where `q1` is lexicographically + * no bigger than `q2`. + * + * States are only constructed if both states in the pair are + * inside a repetition that might backtrack. + */ + MkStatePair(State q1, State q2) { + isFork(q1, _, _, _, _) and q2 = q1 + or + (step(_, _, _, q1, q2) or step(_, _, _, q2, q1)) and + rankState(q1) <= rankState(q2) + } + + /** + * Gets a unique number for a `state`. + * Is used to create an ordering of states, where states with the same `toString()` will be ordered differently. + */ + private int rankState(State state) { + state = + rank[result](State s | + stateInsideBacktracking(s) + | + s order by getTermLocationString(s.getRepr()) + ) + } + + /** + * A state in the product automaton. + */ + private class StatePair extends TStatePair { + State q1; + State q2; + + StatePair() { this = MkStatePair(q1, q2) } + + /** Gets a textual representation of this element. */ + string toString() { result = "(" + q1 + ", " + q2 + ")" } + + /** Gets the first component of the state pair. */ + State getLeft() { result = q1 } + + /** Gets the second component of the state pair. */ + State getRight() { result = q2 } + } + + /** + * Holds for `(fork, fork)` state pairs when `isFork(fork, _, _, _, _)` holds. + * + * Used in `statePairDistToFork` + */ + private predicate isStatePairFork(StatePair p) { + exists(State fork | p = MkStatePair(fork, fork) and isFork(fork, _, _, _, _)) + } + + /** + * Holds if there are transitions from the components of `q` to the corresponding + * components of `r`. + * + * Used in `statePairDistToFork` + */ + private predicate reverseStep(StatePair r, StatePair q) { step(q, _, _, r) } + + /** + * Gets the minimum length of a path from `q` to `r` in the + * product automaton. 
+ */ + private int statePairDistToFork(StatePair q, StatePair r) = + shortestDistances(isStatePairFork/1, reverseStep/2)(r, q, result) + + /** + * Holds if there are transitions from `q` to `r1` and from `q` to `r2` + * labelled with `s1` and `s2`, respectively, where `s1` and `s2` do not + * trivially have an empty intersection. + * + * This predicate only holds for states associated with regular expressions + * that have at least one repetition quantifier in them (otherwise the + * expression cannot be vulnerable to ReDoS attacks anyway). + */ + pragma[noopt] + private predicate isFork(State q, InputSymbol s1, InputSymbol s2, State r1, State r2) { + stateInsideBacktracking(q) and + exists(State q1, State q2 | + q1 = epsilonSucc*(q) and + delta(q1, s1, r1) and + q2 = epsilonSucc*(q) and + delta(q2, s2, r2) and + // Use pragma[noopt] to prevent intersect(s1,s2) from being the starting point of the join. + // From (s1,s2) it would find a huge number of intermediate state pairs (q1,q2) originating from different literals, + // and discover at the end that no `q` can reach both `q1` and `q2` by epsilon transitions. + exists(intersect(s1, s2)) + | + s1 != s2 + or + r1 != r2 + or + r1 = r2 and q1 != q2 + or + // If q can reach itself by epsilon transitions, then there are two distinct paths to the q1/q2 state: + // one that uses the loop and one that doesn't. The engine will separately attempt to match with each path, + // despite ending in the same state. The "fork" thus arises from the choice of whether to use the loop or not. + // To avoid every state in the loop becoming a fork state, + // we arbitrarily pick the InfiniteRepetitionQuantifier state as the canonical fork state for the loop + // (every epsilon-loop must contain such a state). + // + // We additionally require that the there exists another InfiniteRepetitionQuantifier `mid` on the path from `q` to itself. 
+ // This is done to avoid flagging regular expressions such as `/(a?)*b/` - that only has polynomial runtime, and is detected by `js/polynomial-redos`. + // The below code is therefore a heuristic, that only flags regular expressions such as `/(a*)*b/`, + // and does not flag regular expressions such as `/(a?b?)c/`, but the latter pattern is not used frequently. + r1 = r2 and + q1 = q2 and + epsilonSucc+(q) = q and + exists(RegExpTerm term | term = q.getRepr() | term instanceof InfiniteRepetitionQuantifier) and + // One of the mid states is an infinite quantifier itself + exists(State mid, RegExpTerm term | + mid = epsilonSucc+(q) and + term = mid.getRepr() and + term instanceof InfiniteRepetitionQuantifier and + q = epsilonSucc+(mid) and + not mid = q + ) + ) and + stateInsideBacktracking(r1) and + stateInsideBacktracking(r2) + } + + /** + * Gets the state pair `(q1, q2)` or `(q2, q1)`; note that only + * one or the other is defined. + */ + private StatePair mkStatePair(State q1, State q2) { + result = MkStatePair(q1, q2) or result = MkStatePair(q2, q1) + } + + /** + * Holds if there are transitions from the components of `q` to the corresponding + * components of `r` labelled with `s1` and `s2`, respectively. + */ + private predicate step(StatePair q, InputSymbol s1, InputSymbol s2, StatePair r) { + exists(State r1, State r2 | step(q, s1, s2, r1, r2) and r = mkStatePair(r1, r2)) + } + + /** + * Holds if there are transitions from the components of `q` to `r1` and `r2` + * labelled with `s1` and `s2`, respectively. + * + * We only consider transitions where the resulting states `(r1, r2)` are both + * inside a repetition that might backtrack. + */ + pragma[noopt] + private predicate step(StatePair q, InputSymbol s1, InputSymbol s2, State r1, State r2) { + exists(State q1, State q2 | q.getLeft() = q1 and q.getRight() = q2 | + deltaClosed(q1, s1, r1) and + deltaClosed(q2, s2, r2) and + // use noopt to force the join on `intersect` to happen last. 
+ exists(intersect(s1, s2)) + ) and + stateInsideBacktracking(r1) and + stateInsideBacktracking(r2) + } + + private newtype TTrace = + Nil() or + Step(InputSymbol s1, InputSymbol s2, TTrace t) { isReachableFromFork(_, _, s1, s2, t, _) } + + /** + * A list of pairs of input symbols that describe a path in the product automaton + * starting from some fork state. + */ + private class Trace extends TTrace { + /** Gets a textual representation of this element. */ + string toString() { + this = Nil() and result = "Nil()" + or + exists(InputSymbol s1, InputSymbol s2, Trace t | this = Step(s1, s2, t) | + result = "Step(" + s1 + ", " + s2 + ", " + t + ")" + ) + } + } + + /** + * Holds if `r` is reachable from `(fork, fork)` under input `w`, and there is + * a path from `r` back to `(fork, fork)` with `rem` steps. + */ + private predicate isReachableFromFork(State fork, StatePair r, Trace w, int rem) { + exists(InputSymbol s1, InputSymbol s2, Trace v | + isReachableFromFork(fork, r, s1, s2, v, rem) and + w = Step(s1, s2, v) + ) + } + + private predicate isReachableFromFork( + State fork, StatePair r, InputSymbol s1, InputSymbol s2, Trace v, int rem + ) { + // base case + exists(State q1, State q2 | + isFork(fork, s1, s2, q1, q2) and + r = MkStatePair(q1, q2) and + v = Nil() and + rem = statePairDistToFork(r, MkStatePair(fork, fork)) + ) + or + // recursive case + exists(StatePair p | + isReachableFromFork(fork, p, v, rem + 1) and + step(p, s1, s2, r) and + rem = statePairDistToFork(r, MkStatePair(fork, fork)) + ) + } + + /** + * Gets a state in the product automaton from which `(fork, fork)` is + * reachable in zero or more epsilon transitions. + */ + private StatePair getAForkPair(State fork) { + isFork(fork, _, _, _, _) and + result = MkStatePair(epsilonPred*(fork), epsilonPred*(fork)) + } + + /** An implementation of a chain containing chars for use by `Concretizer`. 
*/ + private module CharTreeImpl implements CharTree { + class CharNode = Trace; + + CharNode getPrev(CharNode t) { t = Step(_, _, result) } + + /** Holds if `n` is a trace that is used by `concretize` in `isPumpable`. */ + predicate isARelevantEnd(CharNode n) { + exists(State f | isReachableFromFork(f, getAForkPair(f), n, _)) + } + + string getChar(CharNode t) { + exists(InputSymbol s1, InputSymbol s2 | t = Step(s1, s2, _) | result = intersect(s1, s2)) + } + } + + /** + * Holds if `fork` is a pumpable fork with word `w`. + */ + private predicate isPumpable(State fork, string w) { + exists(StatePair q, Trace t | + isReachableFromFork(fork, q, t, _) and + q = getAForkPair(fork) and + w = Concretizer ::concretize(t) + ) + } + + /** Holds if `state` has exponential ReDoS */ + predicate hasReDoSResult = ReDoSPruning ::hasReDoSResult/4; +} diff --git a/shared/regex/codeql/regex/nfa/NfaUtils.qll b/shared/regex/codeql/regex/nfa/NfaUtils.qll new file mode 100644 index 00000000000..cb5091c40aa --- /dev/null +++ b/shared/regex/codeql/regex/nfa/NfaUtils.qll @@ -0,0 +1,1382 @@ +/** + * A shared library for creating and reasoning about NFA's. + */ + +private import codeql.regex.RegexTreeView + +/** + * Classes and predicates that create an NFA and various algorithms for working with it. + */ +module Make { + private import TreeImpl + + /** + * Gets the char after `c` (from a simplified ASCII table). + */ + private string nextChar(string c) { + exists(int code | code = ascii(c) | code + 1 = ascii(result)) + } + + /** + * Gets an approximation for the ASCII code for `char`. + * Only the easily printable chars are included (so no newline, tab, null, etc). + */ + private int ascii(string char) { + char = + rank[result](string c | + c = + "! \"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~" + .charAt(_) + ) + } + + /** + * Holds if `t` matches at least an epsilon symbol. 
+ * + * That is, this term does not restrict the language of the enclosing regular expression. + * + * This is implemented as an under-approximation, and this predicate does not hold for sub-patterns in particular. + */ + predicate matchesEpsilon(RegExpTerm t) { + t instanceof RegExpStar + or + t instanceof RegExpOpt + or + t.(RegExpRange).getLowerBound() = 0 + or + exists(RegExpTerm child | + child = t.getAChild() and + matchesEpsilon(child) + | + t instanceof RegExpAlt or + t instanceof RegExpGroup or + t instanceof RegExpPlus or + t instanceof RegExpRange + ) + or + matchesEpsilon(t.(RegExpBackRef).getGroup()) + or + forex(RegExpTerm child | child = t.(RegExpSequence).getAChild() | matchesEpsilon(child)) + } + + /** + * A lookahead/lookbehind that matches the empty string. + */ + class EmptyPositiveSubPattern instanceof RegExpSubPattern { + EmptyPositiveSubPattern() { + ( + this instanceof RegExpPositiveLookahead + or + this instanceof RegExpPositiveLookbehind + ) and + matchesEpsilon(this.getOperand()) + } + + /** Gets a string representation of this sub-pattern. */ + string toString() { result = super.toString() } + } + + /** DEPRECATED: Use `EmptyPositiveSubPattern` instead. */ + deprecated class EmptyPositiveSubPatttern = EmptyPositiveSubPattern; + + /** + * A branch in a disjunction that is the root node in a literal, or a literal + * whose root node is not a disjunction. + */ + class RegExpRoot instanceof RegExpTerm { + RegExpRoot() { + exists(RegExpParent parent | + exists(RegExpAlt alt | + alt.isRootTerm() and + this = alt.getAChild() and + parent = alt.getParent() + ) + or + this.isRootTerm() and + not this instanceof RegExpAlt and + parent = this.getParent() + ) + } + + /** + * Holds if this root term is relevant to the ReDoS analysis. 
+ */ + predicate isRelevant() { + // is actually used as a RegExp + super.isUsedAsRegExp() and + // not excluded for library specific reasons + not isExcluded(super.getRootTerm().getParent()) + } + + /** Gets a string representation of this root term. */ + string toString() { result = this.(RegExpTerm).toString() } + + /** Gets the outermost term of this regular expression. */ + RegExpTerm getRootTerm() { result = super.getRootTerm() } + } + + /** + * A constant in a regular expression that represents valid Unicode character(s). + */ + private class RegexpCharacterConstant instanceof RegExpConstant { + RegexpCharacterConstant() { this.isCharacter() } + + string toString() { result = this.(RegExpConstant).toString() } + + RegExpTerm getRootTerm() { result = this.(RegExpConstant).getRootTerm() } + + string getValue() { result = this.(RegExpConstant).getValue() } + } + + /** + * A regexp term that is relevant for this ReDoS analysis. + */ + class RelevantRegExpTerm instanceof RegExpTerm { + RelevantRegExpTerm() { getRoot(this).isRelevant() } + + /** Gets a string representation of this term. */ + string toString() { result = super.toString() } + + /** Gets the raw source text of this term. */ + string getRawValue() { result = super.getRawValue() } + + /** Gets the outermost term of this regular expression. */ + RegExpTerm getRootTerm() { result = super.getRootTerm() } + } + + /** + * Gets a string for the full location of `t`. + */ + string getTermLocationString(RegExpTerm t) { + exists(string file, int startLine, int startColumn, int endLine, int endColumn | + t.hasLocationInfo(file, startLine, startColumn, endLine, endColumn) and + result = file + ":" + startLine + ":" + startColumn + "-" + endLine + ":" + endColumn + ) + } + + /** + * Holds if `term` is the chosen canonical representative for all terms with string representation `str`. + * The string representation includes which flags are used with the regular expression. 
+ * + * Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s. + * The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks. + */ + private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) { + term = + min(RelevantRegExpTerm t | + str = getCanonicalizationString(t) + | + t order by getTermLocationString(t), t.toString() + ) + } + + /** + * Gets a string representation of `term` that is used for canonicalization. + */ + private string getCanonicalizationString(RelevantRegExpTerm term) { + exists(string ignoreCase | + (if isIgnoreCase(term.getRootTerm()) then ignoreCase = "i" else ignoreCase = "") and + result = term.getRawValue() + "|" + ignoreCase + ) + } + + /** + * An abstract input symbol, representing a set of concrete characters. + */ + private newtype TInputSymbol = + /** An input symbol corresponding to character `c`. */ + Char(string c) { + c = + any(RegexpCharacterConstant cc | + cc instanceof RelevantRegExpTerm and + not isIgnoreCase(cc.getRootTerm()) + ).getValue().charAt(_) + or + // normalize everything to lower case if the regexp is case insensitive + c = + any(RegexpCharacterConstant cc, string char | + cc instanceof RelevantRegExpTerm and + isIgnoreCase(cc.getRootTerm()) and + char = cc.getValue().charAt(_) + | + char.toLowerCase() + ) + } or + /** + * An input symbol representing all characters matched by + * a (non-universal) character class that has string representation `charClassString`. + */ + CharClass(string charClassString) { + exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) | + recc instanceof RegExpCharacterClass and + not recc.(RegExpCharacterClass).isUniversalClass() + or + isEscapeClass(recc, _) + ) + } or + /** An input symbol representing all characters matched by `.`. */ + Dot() or + /** An input symbol representing all characters. 
 */ +    Any() or
+    /** An epsilon transition in the automaton. */
+    Epsilon()
+
+  /**
+   * Gets the CharClass corresponding to the canonical representative `term`.
+   */
+  private CharClass getCharClassForCanonicalTerm(RegExpTerm term) {
+    exists(string str | isCanonicalTerm(term, str) | result = CharClass(str))
+  }
+
+  /**
+   * Gets a char class that represents `term`, even when `term` is not the canonical representative.
+   */
+  CharacterClass getCanonicalCharClass(RegExpTerm term) {
+    exists(string str | str = getCanonicalizationString(term) and result = CharClass(str))
+  }
+
+  /**
+   * Holds if `a` and `b` are input symbols from the same regexp.
+   */
+  private predicate sharesRoot(InputSymbol a, InputSymbol b) {
+    exists(RegExpRoot root |
+      belongsTo(a, root) and
+      belongsTo(b, root)
+    )
+  }
+
+  /**
+   * Holds if `a` is an input symbol from a regexp that has root `root`.
+   */
+  private predicate belongsTo(InputSymbol a, RegExpRoot root) {
+    exists(State s | getRoot(s.getRepr()) = root |
+      delta(s, a, _)
+      or
+      delta(_, a, s)
+    )
+  }
+
+  /**
+   * An abstract input symbol, representing a set of concrete characters.
+   */
+  class InputSymbol extends TInputSymbol {
+    InputSymbol() { not this instanceof Epsilon }
+
+    /**
+     * Gets a string representation of this input symbol.
+     */
+    string toString() {
+      this = Char(result)
+      or
+      this = CharClass(result)
+      or
+      this = Dot() and result = "."
+      or
+      this = Any() and result = "[^]"
+    }
+  }
+
+  /**
+   * An abstract input symbol that represents a character class.
+   */
+  abstract class CharacterClass extends InputSymbol {
+    /**
+     * Gets a character that is relevant for intersection-tests involving this
+     * character class.
+     *
+     * Specifically, this is any of the characters mentioned explicitly in the
+     * character class, offset by one if it is inverted. For character class escapes,
+     * the result is as if the class had been written out as a series of intervals.
+ * + * This set is large enough to ensure that for any two intersecting character + * classes, one contains a relevant character from the other. + */ + abstract string getARelevantChar(); + + /** + * Holds if this character class matches `char`. + */ + bindingset[char] + abstract predicate matches(string char); + + /** + * Gets a character matched by this character class. + */ + string choose() { result = this.getARelevantChar() and this.matches(result) } + } + + /** + * Provides implementations for `CharacterClass`. + */ + private module CharacterClasses { + /** + * Holds if the character class `cc` has a child (constant or range) that matches `char`. + */ + pragma[noinline] + predicate hasChildThatMatches(RegExpCharacterClass cc, string char) { + if isIgnoreCase(cc.getRootTerm()) + then + // normalize everything to lower case if the regexp is case insensitive + exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase()) + else hasChildThatMatchesIgnoringCasingFlags(cc, char) + } + + /** + * Holds if the character class `cc` has a child (constant or range) that matches `char`. + * Ignores whether the character class is inside a regular expression that has the ignore case flag. 
+ */ + pragma[noinline] + predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) { + exists(getCharClassForCanonicalTerm(cc)) and + exists(RegExpTerm child | child = cc.getAChild() | + char = child.(RegexpCharacterConstant).getValue() + or + rangeMatchesOnLetterOrDigits(child, char) + or + not rangeMatchesOnLetterOrDigits(child, _) and + char = getARelevantChar() and + exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) | + lo <= char and + char <= hi + ) + or + exists(string charClass | isEscapeClass(child, charClass) | + charClass.toLowerCase() = charClass and + classEscapeMatches(charClass, char) + or + char = getARelevantChar() and + charClass.toUpperCase() = charClass and + not classEscapeMatches(charClass, char) + ) + ) + } + + /** + * Holds if `range` is a range on lower-case, upper-case, or digits, and matches `char`. + * This predicate is used to restrict the searchspace for ranges by only joining `getAnyPossiblyMatchedChar` + * on a few ranges. + */ + private predicate rangeMatchesOnLetterOrDigits(RegExpCharacterRange range, string char) { + exists(string lo, string hi | + range.isRange(lo, hi) and lo = lowercaseLetter() and hi = lowercaseLetter() + | + lo <= char and + char <= hi and + char = lowercaseLetter() + ) + or + exists(string lo, string hi | + range.isRange(lo, hi) and lo = upperCaseLetter() and hi = upperCaseLetter() + | + lo <= char and + char <= hi and + char = upperCaseLetter() + ) + or + exists(string lo, string hi | range.isRange(lo, hi) and lo = digit() and hi = digit() | + lo <= char and + char <= hi and + char = digit() + ) + } + + private string lowercaseLetter() { result = "abcdefghijklmnopqrstuvwxyz".charAt(_) } + + private string upperCaseLetter() { result = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".charAt(_) } + + private string digit() { result = [0 .. 9].toString() } + + /** + * Gets a char that could be matched by a regular expression. 
+ * Includes all printable ascii chars, all constants mentioned in a regexp, and all chars matches by the regexp `/\s|\d|\w/`. + */ + string getARelevantChar() { + exists(ascii(result)) + or + exists(RegexpCharacterConstant c | result = c.getValue().charAt(_)) + or + classEscapeMatches(_, result) + } + + /** + * Gets a char that is mentioned in the character class `c`. + */ + private string getAMentionedChar(RegExpCharacterClass c) { + exists(RegExpTerm child | child = c.getAChild() | + result = child.(RegexpCharacterConstant).getValue() + or + child.(RegExpCharacterRange).isRange(result, _) + or + child.(RegExpCharacterRange).isRange(_, result) + or + exists(string charClass | isEscapeClass(child, charClass) | + result = min(string s | classEscapeMatches(charClass.toLowerCase(), s)) + or + result = max(string s | classEscapeMatches(charClass.toLowerCase(), s)) + ) + ) + } + + bindingset[char, cc] + private string caseNormalize(string char, RegExpTerm cc) { + if isIgnoreCase(cc.getRootTerm()) then result = char.toLowerCase() else result = char + } + + /** + * An implementation of `CharacterClass` for positive (non inverted) character classes. + */ + private class PositiveCharacterClass extends CharacterClass { + RegExpCharacterClass cc; + + PositiveCharacterClass() { this = getCharClassForCanonicalTerm(cc) and not cc.isInverted() } + + override string getARelevantChar() { result = caseNormalize(getAMentionedChar(cc), cc) } + + override predicate matches(string char) { hasChildThatMatches(cc, char) } + } + + /** + * An implementation of `CharacterClass` for inverted character classes. 
+ */ + private class InvertedCharacterClass extends CharacterClass { + RegExpCharacterClass cc; + + InvertedCharacterClass() { this = getCharClassForCanonicalTerm(cc) and cc.isInverted() } + + override string getARelevantChar() { + result = nextChar(caseNormalize(getAMentionedChar(cc), cc)) or + nextChar(result) = caseNormalize(getAMentionedChar(cc), cc) + } + + bindingset[char] + override predicate matches(string char) { not hasChildThatMatches(cc, char) } + } + + /** + * Holds if the character class escape `clazz` (\d, \s, or \w) matches `char`. + */ + pragma[noinline] + private predicate classEscapeMatches(string clazz, string char) { + clazz = "d" and + char = "0123456789".charAt(_) + or + clazz = "s" and + char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f + or + clazz = "w" and + char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_) + } + + /** + * An implementation of `CharacterClass` for \d, \s, and \w. + */ + private class PositiveCharacterClassEscape extends CharacterClass { + string charClass; + RegExpTerm cc; + + PositiveCharacterClassEscape() { + isEscapeClass(cc, charClass) and + this = getCharClassForCanonicalTerm(cc) and + charClass = ["d", "s", "w"] + } + + override string getARelevantChar() { + charClass = "d" and + result = ["0", "9"] + or + charClass = "s" and + result = " " + or + charClass = "w" and + if isIgnoreCase(cc.getRootTerm()) + then result = ["a", "z", "_", "0", "9"] + else result = ["a", "Z", "_", "0", "9"] + } + + override predicate matches(string char) { classEscapeMatches(charClass, char) } + + override string choose() { + charClass = "d" and + result = "9" + or + charClass = "s" and + result = " " + or + charClass = "w" and + result = "a" + } + } + + /** + * An implementation of `CharacterClass` for \D, \S, and \W. 
+ */ + private class NegativeCharacterClassEscape extends CharacterClass { + string charClass; + + NegativeCharacterClassEscape() { + exists(RegExpTerm cc | + isEscapeClass(cc, charClass) and + this = getCharClassForCanonicalTerm(cc) and + charClass = ["D", "S", "W"] + ) + } + + override string getARelevantChar() { + charClass = "D" and + result = ["a", "Z", "!"] + or + charClass = "S" and + result = ["a", "9", "!"] + or + charClass = "W" and + result = [" ", "!"] + } + + bindingset[char] + override predicate matches(string char) { + not classEscapeMatches(charClass.toLowerCase(), char) + } + } + + /** Gets a representative for all char classes that match the same chars as `c`. */ + CharacterClass normalize(CharacterClass c) { + exists(string normalization | + normalization = getNormalizationString(c) and + result = + min(CharacterClass cc, string raw | + getNormalizationString(cc) = normalization and cc = CharClass(raw) + | + cc order by raw + ) + ) + } + + /** Gets a string representing all the chars matched by `c` */ + private string getNormalizationString(CharacterClass c) { + (c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape) and + result = concat(string char | c.matches(char) and char = CharacterClasses::getARelevantChar()) + or + (c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape) and + // the string produced by the concat can not contain repeated chars + // so by starting the below with "nn" we can guarantee that + // it will not overlap with the above case. + // and a negative char class can never match the same chars as a positive one, so we don't miss any results from this. 
+ result = + "nn:" + + concat(string char | not c.matches(char) and char = CharacterClasses::getARelevantChar()) + } + } + + private class EdgeLabel extends TInputSymbol { + string toString() { + this = Epsilon() and result = "" + or + exists(InputSymbol s | this = s and result = s.toString()) + } + } + + /** + * A RegExp term that acts like a plus. + * Either it's a RegExpPlus, or it is a range {1,X} where X is >= 30. + * 30 has been chosen as a threshold because for exponential blowup 2^30 is enough to get a decent DOS attack. + */ + private class EffectivelyPlus instanceof RegExpTerm { + EffectivelyPlus() { + this instanceof RegExpPlus + or + exists(RegExpRange range | + range.getLowerBound() = 1 and + (range.getUpperBound() >= 30 or not exists(range.getUpperBound())) + | + this = range + ) + } + + string toString() { result = this.(RegExpTerm).toString() } + + RegExpTerm getAChild() { result = this.(RegExpTerm).getChild(_) } + + RegExpTerm getChild(int i) { result = this.(RegExpTerm).getChild(i) } + } + + /** + * A RegExp term that acts like a star. + * Either it's a RegExpStar, or it is a range {0,X} where X is >= 30. + */ + private class EffectivelyStar instanceof RegExpTerm { + EffectivelyStar() { + this instanceof RegExpStar + or + exists(RegExpRange range | + range.getLowerBound() = 0 and + (range.getUpperBound() >= 30 or not exists(range.getUpperBound())) + | + this = range + ) + } + + string toString() { result = this.(RegExpTerm).toString() } + + RegExpTerm getAChild() { result = this.(RegExpTerm).getAChild() } + + RegExpTerm getChild(int i) { result = this.(RegExpTerm).getChild(i) } + } + + /** + * A RegExp term that acts like a question mark. + * Either it's a RegExpQuestion, or it is a range {0,1}. 
+ */ + private class EffectivelyQuestion instanceof RegExpTerm { + EffectivelyQuestion() { + this instanceof RegExpOpt + or + exists(RegExpRange range | range.getLowerBound() = 0 and range.getUpperBound() = 1 | + this = range + ) + } + + string toString() { result = this.(RegExpTerm).toString() } + + RegExpTerm getAChild() { result = this.(RegExpTerm).getAChild() } + + RegExpTerm getChild(int i) { result = this.(RegExpTerm).getChild(i) } + } + + /** + * Gets the state before matching `t`. + */ + pragma[inline] + private State before(RegExpTerm t) { result = Match(t, 0) } + + /** + * Gets a state the NFA may be in after matching `t`. + */ + State after(RegExpTerm t) { + exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt)) + or + exists(RegExpSequence seq, int i | t = seq.getChild(i) | + result = before(seq.getChild(i + 1)) + or + i + 1 = seq.getNumChild() and result = after(seq) + ) + or + exists(RegExpGroup grp | t = grp.getAChild() | result = after(grp)) + or + exists(EffectivelyStar star | t = star.getAChild() | + not isPossessive(star) and + result = before(star) + ) + or + exists(EffectivelyPlus plus | t = plus.getAChild() | + not isPossessive(plus) and + result = before(plus) + or + result = after(plus) + ) + or + exists(EffectivelyQuestion opt | t = opt.getAChild() | result = after(opt)) + or + exists(RegExpRoot root | t = root | + if matchesAnySuffix(root) then result = AcceptAnySuffix(root) else result = Accept(root) + ) + } + + /** + * Holds if the NFA has a transition from `q1` to `q2` labelled with `lbl`. 
+ */ + predicate delta(State q1, EdgeLabel lbl, State q2) { + exists(RegexpCharacterConstant s, int i | + q1 = Match(s, i) and + ( + not isIgnoreCase(s.getRootTerm()) and + lbl = Char(s.getValue().charAt(i)) + or + // normalize everything to lower case if the regexp is case insensitive + isIgnoreCase(s.getRootTerm()) and + exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase())) + ) and + ( + q2 = Match(s, i + 1) + or + s.getValue().length() = i + 1 and + q2 = after(s) + ) + ) + or + exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) | + if isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot() + ) + or + exists(RegExpCharacterClass cc | + cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc) + or + q1 = before(cc) and + lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and + q2 = after(cc) + ) + or + exists(RegExpTerm cc | isEscapeClass(cc, _) | + q1 = before(cc) and + lbl = CharacterClasses::normalize(CharClass(getCanonicalizationString(cc))) and + q2 = after(cc) + ) + or + exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild())) + or + exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0))) + or + exists(RegExpGroup grp | lbl = Epsilon() | q1 = before(grp) and q2 = before(grp.getChild(0))) + or + exists(EffectivelyStar star | lbl = Epsilon() | + q1 = before(star) and q2 = before(star.getChild(0)) + or + q1 = before(star) and q2 = after(star) + ) + or + exists(EffectivelyPlus plus | lbl = Epsilon() | + q1 = before(plus) and q2 = before(plus.getChild(0)) + ) + or + exists(EffectivelyQuestion opt | lbl = Epsilon() | + q1 = before(opt) and q2 = before(opt.getChild(0)) + or + q1 = before(opt) and q2 = after(opt) + ) + or + exists(RegExpRoot root | q1 = AcceptAnySuffix(root) | + lbl = Any() and q2 = q1 + or + lbl = Epsilon() and q2 = Accept(root) + ) + or + exists(RegExpRoot root | q1 = Match(root, 0) | + 
matchesAnyPrefix(root) and lbl = Any() and q2 = q1 + ) + or + exists(RegExpDollar dollar | q1 = before(dollar) | + lbl = Epsilon() and q2 = Accept(getRoot(dollar)) + ) + or + exists(EmptyPositiveSubPattern empty | q1 = before(empty) | + lbl = Epsilon() and q2 = after(empty) + ) + } + + /** + * Gets a state that `q` has an epsilon transition to. + */ + State epsilonSucc(State q) { delta(q, Epsilon(), result) } + + /** + * Gets a state that has an epsilon transition to `q`. + */ + State epsilonPred(State q) { q = epsilonSucc(result) } + + /** + * Holds if there is a state `q` that can be reached from `q1` + * along epsilon edges, such that there is a transition from + * `q` to `q2` that consumes symbol `s`. + */ + predicate deltaClosed(State q1, InputSymbol s, State q2) { delta(epsilonSucc*(q1), s, q2) } + + /** + * Gets the root containing the given term, that is, the root of the literal, + * or a branch of the root disjunction. + */ + RegExpRoot getRoot(RegExpTerm term) { + result = term or + result = getRoot(term.getParent()) + } + + /** + * A state in the NFA. + */ + newtype TState = + /** + * A state representing that the NFA is about to match a term. + * `i` is used to index into multi-char literals. + */ + Match(RelevantRegExpTerm t, int i) { + i = 0 + or + exists(t.(RegexpCharacterConstant).getValue().charAt(i)) + } or + /** + * An accept state, where exactly the given input string is accepted. + */ + Accept(RegExpRoot l) { l.isRelevant() } or + /** + * An accept state, where the given input string, or any string that has this + * string as a prefix, is accepted. + */ + AcceptAnySuffix(RegExpRoot l) { l.isRelevant() } + + /** + * Gets a state that is about to match the regular expression `t`. + */ + State mkMatch(RegExpTerm t) { result = Match(t, 0) } + + /** + * A state in the NFA corresponding to a regular expression. 
+ * + * Each regular expression literal `l` has one accepting state + * `Accept(l)`, one state that accepts all suffixes `AcceptAnySuffix(l)`, + * and a state `Match(t, i)` for every subterm `t`, + * which represents the state of the NFA before starting to + * match `t`, or the `i`th character in `t` if `t` is a constant. + */ + class State extends TState { + RegExpTerm repr; + + State() { + this = Match(repr, _) or + this = Accept(repr) or + this = AcceptAnySuffix(repr) + } + + /** + * Gets a string representation for this state in a regular expression. + */ + string toString() { + exists(int i | this = Match(repr, i) | result = "Match(" + repr + "," + i + ")") + or + this instanceof Accept and + result = "Accept(" + repr + ")" + or + this instanceof AcceptAnySuffix and + result = "AcceptAny(" + repr + ")" + } + + /** + * Gets the term represented by this state. + */ + RegExpTerm getRepr() { result = repr } + } + + /** + * Gets the minimum char that is matched by both the character classes `c` and `d`. + */ + private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) { + result = min(getAOverlapBetweenCharacterClasses(c, d)) + } + + /** + * Gets a char that is matched by both the character classes `c` and `d`. + * And `c` and `d` is not the same character class. + */ + private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) { + sharesRoot(c, d) and + result = [c.getARelevantChar(), d.getARelevantChar()] and + c.matches(result) and + d.matches(result) and + not c = d + } + + /** + * Gets a character that is represented by both `c` and `d`. 
+ */ + string intersect(InputSymbol c, InputSymbol d) { + (sharesRoot(c, d) or [c, d] = Any()) and + ( + c = Char(result) and + d = getAnInputSymbolMatching(result) + or + result = getMinOverlapBetweenCharacterClasses(c, d) + or + result = c.(CharacterClass).choose() and + ( + d = c + or + d = Dot() and + not (result = "\n" or result = "\r") + or + d = Any() + ) + or + (c = Dot() or c = Any()) and + (d = Dot() or d = Any()) and + result = "a" + ) + or + result = intersect(d, c) + } + + /** + * Gets a symbol that matches `char`. + */ + bindingset[char] + InputSymbol getAnInputSymbolMatching(string char) { + result = Char(char) + or + result.(CharacterClass).matches(char) + or + result = Dot() and + not (char = "\n" or char = "\r") + or + result = Any() + } + + /** + * Holds if `state` is a start state. + */ + predicate isStartState(State state) { + state = mkMatch(any(RegExpRoot r)) + or + exists(RegExpCaret car | state = after(car)) + } + + /** + * Holds if `state` is a candidate for ReDoS with string `pump`. + */ + signature predicate isCandidateSig(State state, string pump); + + /** + * Holds if `state` is a candidate for ReDoS. + */ + signature predicate isCandidateSig(State state); + + /** + * Predicates for constructing a prefix string that leads to a given state. + */ + module PrefixConstruction { + /** + * Holds if `state` is the textually last start state for the regular expression. + */ + private predicate lastStartState(RelevantState state) { + exists(RegExpRoot root | + state = + max(RelevantState s | + isStartState(s) and + getRoot(s.getRepr()) = root + | + s order by getTermLocationString(s.getRepr()), s.getRepr().toString() + ) + ) + } + + /** + * Holds if there exists any transition (Epsilon() or other) from `a` to `b`. + */ + private predicate existsTransition(State a, State b) { delta(a, _, b) } + + /** + * Gets the minimum number of transitions it takes to reach `state` from the `start` state. 
+ */ + int prefixLength(State start, State state) = + shortestDistances(lastStartState/1, existsTransition/2)(start, state, result) + + /** + * Gets the minimum number of transitions it takes to reach `state` from the start state. + */ + private int lengthFromStart(State state) { result = prefixLength(_, state) } + + /** + * Gets a string for which the regular expression will reach `state`. + * + * Has at most one result for any given `state`. + * This predicate will not always have a result even if there is a ReDoS issue in + * the regular expression. + */ + string prefix(State state) { + lastStartState(state) and + result = "" + or + // the search stops past the last redos candidate state. + lengthFromStart(state) <= max(lengthFromStart(any(State s | isCandidate(s)))) and + exists(State prev | + // select a unique predecessor (by an arbitrary measure) + prev = + min(State s | + lengthFromStart(s) = lengthFromStart(state) - 1 and + delta(s, _, state) + | + s order by getTermLocationString(s.getRepr()), s.getRepr().toString() + ) + | + // greedy search for the shortest prefix + result = prefix(prev) and delta(prev, Epsilon(), state) + or + not delta(prev, Epsilon(), state) and + result = prefix(prev) + getCanonicalEdgeChar(prev, state) + ) + } + + /** + * Gets a canonical char for which there exists a transition from `prev` to `next` in the NFA. + */ + private string getCanonicalEdgeChar(State prev, State next) { + result = + min(string c | delta(prev, any(InputSymbol symbol | c = intersect(Any(), symbol)), next)) + } + + /** A state within a regular expression that contains a candidate state. */ + class RelevantState instanceof State { + RelevantState() { + exists(State s | isCandidate(s) | getRoot(s.getRepr()) = getRoot(this.getRepr())) + } + + /** Gets a string representation for this state in a regular expression. */ + string toString() { result = State.super.toString() } + + /** Gets the term represented by this state. 
*/ + RegExpTerm getRepr() { result = State.super.getRepr() } + } + } + + /** + * A module for pruning candidate ReDoS states. + * The candidates are specified by the `isCandidate` signature predicate. + * The candidates are checked for rejecting suffixes and deduplicated, + * and the resulting ReDoS states are read by the `hasReDoSResult` predicate. + */ + module ReDoSPruning { + /** + * Holds if repeating `pump` starting at `state` is a candidate for causing backtracking. + * No check whether a rejected suffix exists has been made. + */ + private predicate isReDoSCandidate(State state, string pump) { + isCandidate(state, pump) and + not state = acceptsAnySuffix() and // pruning early - these can never get stuck in a rejecting state. + ( + not isCandidate(epsilonSucc+(state), _) + or + epsilonSucc+(state) = state and + state = + max(State s | + s = epsilonSucc+(state) and + isCandidate(s, _) and + s.getRepr() instanceof InfiniteRepetitionQuantifier + | + s order by getTermLocationString(s.getRepr()), s.getRepr().toString() + ) + ) + } + + /** Gets a state that can reach the `accept-any` state using only epsilon steps. */ + private State acceptsAnySuffix() { epsilonSucc*(result) = AcceptAnySuffix(_) } + + private predicate isCandidateState(State s) { isReDoSCandidate(s, _) } + + import PrefixConstruction as Prefix + + class RelevantState = Prefix::RelevantState; + + /** + * Predicates for testing the presence of a rejecting suffix. + * + * These predicates are used to ensure that the all states reached from the fork + * by repeating `w` have a rejecting suffix. + * + * For example, a regexp like `/^(a+)+/` will accept any string as long the prefix is + * some number of `"a"`s, and it is therefore not possible to construct a rejecting suffix. + * + * A regexp like `/(a+)+$/` or `/(a+)+b/` trivially has a rejecting suffix, + * as the suffix "X" will cause both the regular expressions to be rejected. 
+ * + * The string `w` is repeated any number of times because it needs to be + * infinitely repeatable for the attack to work. + * For the regular expression `/((ab)+)*abab/` the accepting state is not reachable from the fork + * using epsilon transitions. But any attempt at repeating `w` will end in a state that accepts all suffixes. + */ + private module SuffixConstruction { + /** + * Holds if all states reachable from `fork` by repeating `w` + * are likely rejectable by appending some suffix. + */ + predicate reachesOnlyRejectableSuffixes(State fork, string w) { + isReDoSCandidate(fork, w) and + forex(State next | next = process(fork, w, w.length() - 1) | isLikelyRejectable(next)) and + not getProcessPrevious(fork, _, w) = acceptsAnySuffix() // we stop `process(..)` early if we can, check here if it happened. + } + + /** + * Holds if there likely exists a suffix starting from `s` that leads to the regular expression being rejected. + * This predicate might find impossible suffixes when searching for suffixes of length > 1, which can cause FPs. + */ + pragma[noinline] + private predicate isLikelyRejectable(RelevantState s) { + // exists a reject edge with some char. + hasRejectEdge(s) + or + hasEdgeToLikelyRejectable(s) + or + // stopping here is rejection + isRejectState(s) + } + + /** + * Holds if `s` is not an accept state, and there is no epsilon transition to an accept state. + */ + predicate isRejectState(RelevantState s) { not epsilonSucc*(s) = Accept(_) } + + /** + * Holds if there is likely a non-empty suffix leading to rejection starting in `s`. + */ + pragma[noopt] + predicate hasEdgeToLikelyRejectable(RelevantState s) { + // all edges (at least one) with some char leads to another state that is rejectable. + // the `next` states might not share a common suffix, which can cause FPs. + exists(string char | char = hasEdgeToLikelyRejectableHelper(s) | + // noopt to force `hasEdgeToLikelyRejectableHelper` to be first in the join-order. 
+ exists(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next)) and + forall(State next | deltaClosedChar(s, char, next) | isLikelyRejectable(next)) + ) + } + + /** + * Gets a char for there exists a transition away from `s`, + * and `s` has not been found to be rejectable by `hasRejectEdge` or `isRejectState`. + */ + pragma[noinline] + private string hasEdgeToLikelyRejectableHelper(RelevantState s) { + not hasRejectEdge(s) and + not isRejectState(s) and + deltaClosedChar(s, result, _) + } + + /** + * Holds if there is a state `next` that can be reached from `prev` + * along epsilon edges, such that there is a transition from + * `prev` to `next` that the character symbol `char`. + */ + predicate deltaClosedChar(RelevantState prev, string char, RelevantState next) { + deltaClosed(prev, getAnInputSymbolMatchingRelevant(char), next) + } + + pragma[noinline] + InputSymbol getAnInputSymbolMatchingRelevant(string char) { + char = relevant(_) and + result = getAnInputSymbolMatching(char) + } + + pragma[noinline] + RegExpRoot relevantRoot() { + exists(RegExpTerm term, State s | + s.getRepr() = term and isCandidateState(s) and result = term.getRootTerm() + ) + } + + /** + * Gets a char used for finding possible suffixes inside `root`. + */ + pragma[noinline] + private string relevant(RegExpRoot root) { + root = relevantRoot() and + ( + exists(ascii(result)) and exists(root) + or + exists(InputSymbol s | belongsTo(s, root) | result = intersect(s, _)) + or + // The characters from `hasSimpleRejectEdge`. Only `\n` is really needed (as `\n` is not in the `ascii` relation). + // The three chars must be kept in sync with `hasSimpleRejectEdge`. + result = ["|", "\n", "Z"] and exists(root) + ) + } + + /** + * Holds if there exists a `char` such that there is no edge from `s` labeled `char` in our NFA. + * The NFA does not model reject states, so the above is the same as saying there is a reject edge. 
+ */ + private predicate hasRejectEdge(State s) { + hasSimpleRejectEdge(s) + or + not hasSimpleRejectEdge(s) and + exists(string char | char = relevant(getRoot(s.getRepr())) | + not deltaClosedChar(s, char, _) + ) + } + + /** + * Holds if there is no edge from `s` labeled with "|", "\n", or "Z" in our NFA. + * This predicate is used as a cheap pre-processing to speed up `hasRejectEdge`. + */ + private predicate hasSimpleRejectEdge(State s) { + // The three chars were chosen arbitrarily. The three chars must be kept in sync with `relevant`. + exists(string char | char = ["|", "\n", "Z"] | not deltaClosedChar(s, char, _)) + } + + /** + * Gets a state that can be reached from pumpable `fork` consuming all + * chars in `w` any number of times followed by the first `i+1` characters of `w`. + */ + pragma[noopt] + private State process(State fork, string w, int i) { + exists(State prev | prev = getProcessPrevious(fork, i, w) | + not prev = acceptsAnySuffix() and // we stop `process(..)` early if we can. If the successor accepts any suffix, then we know it can never be rejected. + exists(string char, InputSymbol sym | + char = w.charAt(i) and + deltaClosed(prev, sym, result) and + // noopt to prevent joining `prev` with all possible `chars` that could transition away from `prev`. + // Instead only join with the set of `chars` where a relevant `InputSymbol` has already been found. + sym = getAProcessInputSymbol(char) + ) + ) + } + + /** + * Gets a state that can be reached from pumpable `fork` consuming all + * chars in `w` any number of times followed by the first `i` characters of `w`. + */ + private State getProcessPrevious(State fork, int i, string w) { + isReDoSCandidate(fork, w) and + ( + i = 0 and result = fork + or + result = process(fork, w, i - 1) + or + // repeat until fixpoint + i = 0 and + result = process(fork, w, w.length() - 1) + ) + } + + /** + * Gets an InputSymbol that matches `char`. 
+ * The predicate is specialized to only have a result for the `char`s that are relevant for the `process` predicate. + */ + private InputSymbol getAProcessInputSymbol(string char) { + char = getAProcessChar() and + result = getAnInputSymbolMatching(char) + } + + /** + * Gets a `char` that occurs in a `pump` string. + */ + private string getAProcessChar() { result = any(string s | isReDoSCandidate(_, s)).charAt(_) } + } + + /** + * Holds if `term` may cause superlinear backtracking on strings containing many repetitions of `pump`. + * Gets the shortest string that causes superlinear backtracking. + */ + private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) { + exists(int i, string c | s = Match(term, i) | + c = + min(string w | + isCandidate(s, w) and + SuffixConstruction::reachesOnlyRejectableSuffixes(s, w) + | + w order by w.length(), w + ) and + pump = escape(rotate(c, i)) + ) + } + + /** + * Holds if the state `s` (represented by the term `t`) can have backtracking with repetitions of `pump`. + * + * `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found). + */ + predicate hasReDoSResult(RegExpTerm t, string pump, State s, string prefixMsg) { + isReDoSAttackable(t, pump, s) and + ( + prefixMsg = "starting with '" + escape(Prefix::prefix(s)) + "' and " and + not Prefix::prefix(s) = "" + or + Prefix::prefix(s) = "" and prefixMsg = "" + or + not exists(Prefix::prefix(s)) and prefixMsg = "" + ) + } + + /** + * Gets the result of backslash-escaping newlines, carriage-returns and + * backslashes in `s`. + */ + bindingset[s] + private string escape(string s) { + result = + s.replaceAll("\\", "\\\\") + .replaceAll("\n", "\\n") + .replaceAll("\r", "\\r") + .replaceAll("\t", "\\t") + } + + /** + * Gets `str` with the last `i` characters moved to the front. 
+ * + * We use this to adjust the pump string to match with the beginning of + * a RegExpTerm, so it doesn't start in the middle of a constant. + */ + bindingset[str, i] + private string rotate(string str, int i) { + result = str.suffix(str.length() - i) + str.prefix(str.length() - i) + } + } + + /** + * A module that describes a tree where each node has one or more associated characters, also known as a trie. + * The root node has no associated character. + * This module is a signature used in `Concretizer`. + */ + signature module CharTree { + /** A node in the tree. */ + class CharNode; + + /** Gets the previous node in the tree from `t`. */ + CharNode getPrev(CharNode t); + + /** + * Holds if `n` is at the end of a tree. I.e. a node that should have a result in the `Concretizer` module. + * Such a node can still have children. + */ + predicate isARelevantEnd(CharNode n); + + /** Gets a char associated with `t`. */ + string getChar(CharNode t); + } + + /** + * Implements an algorithm for computing all possible strings + * from following a tree of nodes (as described in `CharTree`). + * + * The string is build using one big concat, where all the chars are computed first. + * See `concretize`. + */ + module Concretizer { + private class Node = Impl::CharNode; + + private predicate getPrev = Impl::getPrev/1; + + private predicate isARelevantEnd = Impl::isARelevantEnd/1; + + private predicate getChar = Impl::getChar/1; + + /** Holds if `n` is on a path from the root to a leaf, and is therefore relevant for the results in `concretize`. */ + private predicate isRelevant(Node n) { + isARelevantEnd(n) + or + exists(Node succ | isRelevant(succ) | n = getPrev(succ)) + } + + /** Holds if `n` is a root with no predecessors. */ + private predicate isRoot(Node n) { not exists(getPrev(n)) } + + /** Gets the distance from a root to `n`. 
*/ + private int nodeDepth(Node n) { + result = 0 and isRoot(n) + or + isRelevant(n) and + exists(Node prev | result = nodeDepth(prev) + 1 | prev = getPrev(n)) + } + + /** Gets an ancestor of `end`, where `end` is a node that should have a result in `concretize`. */ + private Node getAnAncestor(Node end) { isARelevantEnd(end) and result = getPrev*(end) } + + /** Gets the `i`th character on the path from the root to `n`. */ + pragma[noinline] + private string getPrefixChar(Node n, int i) { + exists(Node ancestor | + result = getChar(ancestor) and + ancestor = getAnAncestor(n) and + i = nodeDepth(ancestor) + ) + } + + /** Gets a string corresponding to `node`. */ + language[monotonicAggregates] + string concretize(Node n) { + result = strictconcat(int i | exists(getPrefixChar(n, i)) | getPrefixChar(n, i) order by i) + } + } +} diff --git a/shared/regex/codeql/regex/nfa/RegexpMatching.qll b/shared/regex/codeql/regex/nfa/RegexpMatching.qll new file mode 100644 index 00000000000..e4d5f4667eb --- /dev/null +++ b/shared/regex/codeql/regex/nfa/RegexpMatching.qll @@ -0,0 +1,176 @@ +/** + * Provides predicates for reasoning about which strings are matched by a regular expression, + * and for testing which capture groups are filled when a particular regexp matches a string. + */ + +private import NfaUtils as NfaUtils +private import codeql.regex.RegexTreeView + +/** + * A parameterized module implementing the analysis described in the above papers. + */ +module Make { + private import TreeImpl + import NfaUtils::Make + + /** A root term */ + class RootTerm instanceof RegExpTerm { + RootTerm() { this.isRootTerm() } + + /** Gets a string representation of this term. */ + string toString() { result = super.toString() } + + /** Holds if this term has the specified location. 
*/ + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ) { + super.hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn) + } + } + + /** + * Holds if it should be tested whether `root` matches `str`. + * + * If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor. + * E.g. a regular expression `/foo$/` will match any string that ends with "foo", + * but if `ignorePrefix` is true, it will only match "foo". + * + * If `testWithGroups` is true, then the `RegexpMatching::fillsCaptureGroup` predicate can be used to determine which capture + * groups are filled by a string. + */ + signature predicate isRegexpMatchingCandidateSig( + RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups + ); + + /** + * A module for determining if a regexp matches a given string, + * and reasoning about which capture groups are filled by a given string. + * + * The module parameter `isCandidate` determines which strings should be tested, + * and the results can be read from the `matches` and `fillsCaptureGroup` predicates. + */ + module RegexpMatching { + /** + * Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`. + * The regular expression is modeled as a non-deterministic finite automaton, + * the regular expression can therefore be in multiple states after matching a character. + * + * It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state. 
+ */ + private State getAState(RootTerm reg, int i, string str, boolean ignorePrefix) { + // start state, the -1 position before any chars have been matched + i = -1 and + isCandidate(reg, str, ignorePrefix, _) and + result.getRepr().getRootTerm() = reg and + isStartState(result) + or + // recursive case + result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix) + } + + /** + * Gets the next state after the `prev` state from `reg`. + * `prev` is the state after matching `fromIndex` chars in `str`, + * and the result is the state after matching `toIndex` chars in `str`. + * + * This predicate is used as a step relation in the forwards search (`getAState`), + * and also as a step relation in the later backwards search (`getAStateThatReachesAccept`). + */ + private State getAStateAfterMatching( + RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix + ) { + // the basic recursive case - outlined into a noopt helper to make performance work out. + result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix) + or + // we can skip past word boundaries if the next char is a non-word char. + fromIndex = toIndex and + prev.getRepr() instanceof RegExpWordBoundary and + prev = getAState(reg, toIndex, str, ignorePrefix) and + after(prev.getRepr()) = result and + str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char. + } + + pragma[noopt] + private State getAStateAfterMatchingAux( + RootTerm reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix + ) { + prev = getAState(reg, fromIndex, str, ignorePrefix) and + fromIndex = toIndex - 1 and + exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and + not discardedPrefixStep(prev, result, ignorePrefix) + } + + /** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. 
*/ + private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) { + prev = mkMatch(any(RegExpRoot r)) and + ignorePrefix = true and + next = prev + } + + // The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`. + private predicate specializedDeltaClosed(State prev, string char, State next) { + deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next) + } + + // The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`. + pragma[noinline] + private InputSymbol specializedGetAnInputSymbolMatching(string char) { + exists(string s, RootTerm r | isCandidate(r, s, _, _) | char = s.charAt(_)) and + result = getAnInputSymbolMatching(char) + } + + /** + * Gets the `i`th state on a path to the accepting state when `reg` matches `str`. + * Starts with an accepting state as found by `getAState` and searches backwards + * to the start state through the reachable states (as found by `getAState`). + * + * This predicate satisfies the invariant that the result state can be reached with `i` steps from a start state, + * and an accepting state can be found after (`str.length() - 1 - i`) steps from the result. + * The result state is therefore always on a valid path where `reg` accepts `str`. + * + * This predicate is only used to find which capture groups a regular expression has filled, + * and thus the search is only performed for the strings in the `testWithGroups(..)` predicate. + */ + private State getAStateThatReachesAccept(RootTerm reg, int i, string str, boolean ignorePrefix) { + // base case, reaches an accepting state from the last state in `getAState(..)` + isCandidate(reg, str, ignorePrefix, true) and + i = str.length() - 1 and + result = getAState(reg, i, str, ignorePrefix) and + epsilonSucc*(result) = Accept(_) + or + // recursive case. `next` is the next state to be matched after matching `prev`. 
+ // this predicate is doing a backwards search, so `prev` is the result we are looking for. + exists(State next, State prev, int fromIndex, int toIndex | + next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and + next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and + i = fromIndex and + result = prev + ) + } + + /** Gets the capture group number that `term` belongs to. */ + private int group(RegExpTerm term) { + exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp) + } + + /** + * Holds if `reg` matches `str`, where `str` is in the `isCandidate` predicate. + */ + predicate matches(RootTerm reg, string str) { + exists(State state | state = getAState(reg, str.length() - 1, str, _) | + epsilonSucc*(state) = Accept(_) + ) + } + + /** + * Holds if matching `str` against `reg` may fill capture group number `g`. + * Only holds if `str` is in the `testWithGroups` predicate. + */ + predicate fillsCaptureGroup(RootTerm reg, string str, int g) { + exists(State s | + s = getAStateThatReachesAccept(reg, _, str, _) and + g = group(s.getRepr()) + ) + } + } +} diff --git a/shared/regex/codeql/regex/nfa/SuperlinearBackTracking.qll b/shared/regex/codeql/regex/nfa/SuperlinearBackTracking.qll new file mode 100644 index 00000000000..efe5beb3b4a --- /dev/null +++ b/shared/regex/codeql/regex/nfa/SuperlinearBackTracking.qll @@ -0,0 +1,440 @@ +/** + * This module implements the analysis described in the paper: + * Valentin Wustholz, Oswaldo Olivo, Marijn J. H. Heule, and Isil Dillig: + * Static Detection of DoS Vulnerabilities in + * Programs that use Regular Expressions + * (Extended Version). + * (https://arxiv.org/pdf/1701.04045.pdf) + * + * Theorem 3 from the paper describes the basic idea. + * + * The following explains the idea using variables and predicate names that are used in the implementation: + * We consider a pair of repetitions, which we will call `pivot` and `succ`. 
+ * + * We create a product automaton of 3-tuples of states (see `StateTuple`). + * There exists a transition `(a,b,c) -> (d,e,f)` in the product automaton + * iff there exists three transitions in the NFA `a->d, b->e, c->f` where those three + * transitions all match a shared character `char`. (see `getAThreewayIntersect`) + * + * We start a search in the product automaton at `(pivot, pivot, succ)`, + * and search for a series of transitions (a `Trace`), such that we end + * at `(pivot, succ, succ)` (see `isReachableFromStartTuple`). + * + * For example, consider the regular expression `/^\d*5\w*$/`. + * The search will start at the tuple `(\d*, \d*, \w*)` and search + * for a path to `(\d*, \w*, \w*)`. + * This path exists, and consists of a single transition in the product automaton, + * where the three corresponding NFA edges all match the character `"5"`. + * + * The start-state in the NFA has an any-transition to itself, this allows us to + * flag regular expressions such as `/a*$/` - which does not have a start anchor - + * and can thus start matching anywhere. + * + * The implementation is not perfect. + * It has the same suffix detection issue as the `js/redos` query, which can cause false positives. + * It also doesn't find all transitions in the product automaton, which can cause false negatives. + */ + +private import NfaUtils as NfaUtils +private import codeql.regex.RegexTreeView + +/** + * A parameterized module implementing the analysis described in the above papers. + */ +module Make { + private import TreeImpl + import NfaUtils::Make + + /** + * Gets any root (start) state of a regular expression. 
+ */ + private State getRootState() { result = mkMatch(any(RegExpRoot r)) } + + private newtype TStateTuple = + MkStateTuple(State q1, State q2, State q3) { + // starts at (pivot, pivot, succ) + isStartLoops(q1, q3) and q1 = q2 + or + step(_, _, _, _, q1, q2, q3) and FeasibleTuple::isFeasibleTuple(q1, q2, q3) + } + + /** + * A state in the product automaton. + * The product automaton contains 3-tuples of states. + * + * We lazily only construct those states that we are actually + * going to need. + * Either a start state `(pivot, pivot, succ)`, or a state + * where there exists a transition from an already existing state. + * + * The exponential variant of this query (`js/redos`) uses an optimization + * trick where `q1 <= q2`. This trick cannot be used here as the order + * of the elements matters. + */ + class StateTuple extends TStateTuple { + State q1; + State q2; + State q3; + + StateTuple() { this = MkStateTuple(q1, q2, q3) } + + /** + * Gets a string representation of this tuple. + */ + string toString() { result = "(" + q1 + ", " + q2 + ", " + q3 + ")" } + + /** + * Holds if this tuple is `(r1, r2, r3)`. + */ + pragma[noinline] + predicate isTuple(State r1, State r2, State r3) { r1 = q1 and r2 = q2 and r3 = q3 } + } + + /** + * A module for determining feasible tuples for the product automaton. + * + * The implementation is split into many predicates for performance reasons. + */ + private module FeasibleTuple { + /** + * Holds if the tuple `(r1, r2, r3)` might be on a path from a start-state to an end-state in the product automaton. 
+ */ + pragma[inline] + predicate isFeasibleTuple(State r1, State r2, State r3) { + // The first element is either inside a repetition (or the start state itself) + isRepetitionOrStart(r1) and + // The last element is inside a repetition + stateInsideRepetition(r3) and + // The states are reachable in the NFA in the order r1 -> r2 -> r3 + delta+(r1) = r2 and + delta+(r2) = r3 and + // The first element can reach a beginning (the "pivot" state in a `(pivot, succ)` pair). + canReachABeginning(r1) and + // The last element can reach a target (the "succ" state in a `(pivot, succ)` pair). + canReachATarget(r3) + } + + /** + * Holds if `s` is either inside a repetition, or is the start state (which is a repetition). + */ + pragma[noinline] + private predicate isRepetitionOrStart(State s) { + stateInsideRepetition(s) or s = getRootState() + } + + /** + * Holds if state `s` might be inside a backtracking repetition. + */ + pragma[noinline] + private predicate stateInsideRepetition(State s) { + s.getRepr().getParent*() instanceof InfiniteRepetitionQuantifier + } + + /** + * Holds if there exists a path in the NFA from `s` to a "pivot" state + * (from a `(pivot, succ)` pair that starts the search). + */ + pragma[noinline] + private predicate canReachABeginning(State s) { + delta+(s) = any(State pivot | isStartLoops(pivot, _)) + } + + /** + * Holds if there exists a path in the NFA from `s` to a "succ" state + * (from a `(pivot, succ)` pair that starts the search). + */ + pragma[noinline] + private predicate canReachATarget(State s) { + delta+(s) = any(State succ | isStartLoops(_, succ)) + } + } + + /** + * Holds if `pivot` and `succ` are a pair of loops that could be the beginning of a quadratic blowup. + * + * There is a slight implementation difference compared to the paper: this predicate requires that `pivot != succ`. + * The case where `pivot = succ` causes exponential backtracking and is handled by the `js/redos` query. 
+ */ + predicate isStartLoops(State pivot, State succ) { + pivot != succ and + succ.getRepr() instanceof InfiniteRepetitionQuantifier and + delta+(pivot) = succ and + ( + pivot.getRepr() instanceof InfiniteRepetitionQuantifier + or + pivot = mkMatch(any(RegExpRoot root)) + ) + } + + /** + * Gets a state for which there exists a transition in the NFA from `s'. + */ + State delta(State s) { delta(s, _, result) } + + /** + * Holds if there are transitions from the components of `q` to the corresponding + * components of `r` labelled with `s1`, `s2`, and `s3`, respectively. + */ + pragma[noinline] + predicate step(StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, StateTuple r) { + exists(State r1, State r2, State r3 | + step(q, s1, s2, s3, r1, r2, r3) and r = MkStateTuple(r1, r2, r3) + ) + } + + /** + * Holds if there are transitions from the components of `q` to `r1`, `r2`, and `r3 + * labelled with `s1`, `s2`, and `s3`, respectively. + */ + pragma[noopt] + predicate step( + StateTuple q, InputSymbol s1, InputSymbol s2, InputSymbol s3, State r1, State r2, State r3 + ) { + exists(State q1, State q2, State q3 | q.isTuple(q1, q2, q3) | + deltaClosed(q1, s1, r1) and + deltaClosed(q2, s2, r2) and + deltaClosed(q3, s3, r3) and + // use noopt to force the join on `getAThreewayIntersect` to happen last. + exists(getAThreewayIntersect(s1, s2, s3)) + ) + } + + /** + * Gets a char that is matched by all the edges `s1`, `s2`, and `s3`. + * + * The result is not complete, and might miss some combination of edges that share some character. 
+ */ + pragma[noinline] + string getAThreewayIntersect(InputSymbol s1, InputSymbol s2, InputSymbol s3) { + result = minAndMaxIntersect(s1, s2) and result = [intersect(s2, s3), intersect(s1, s3)] + or + result = minAndMaxIntersect(s1, s3) and result = [intersect(s2, s3), intersect(s1, s2)] + or + result = minAndMaxIntersect(s2, s3) and result = [intersect(s1, s2), intersect(s1, s3)] + } + + /** + * Gets the minimum and maximum characters that intersect between `a` and `b`. + * This predicate is used to limit the size of `getAThreewayIntersect`. + */ + pragma[noinline] + string minAndMaxIntersect(InputSymbol a, InputSymbol b) { + result = [min(intersect(a, b)), max(intersect(a, b))] + } + + private newtype TTrace = + Nil() or + Step(InputSymbol s1, InputSymbol s2, InputSymbol s3, TTrace t) { + isReachableFromStartTuple(_, _, t, s1, s2, s3, _, _) + } + + /** + * A list of tuples of input symbols that describe a path in the product automaton + * starting from some start state. + */ + class Trace extends TTrace { + /** + * Gets a string representation of this Trace that can be used for debug purposes. + */ + string toString() { + this = Nil() and result = "Nil()" + or + exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace t | this = Step(s1, s2, s3, t) | + result = "Step(" + s1 + ", " + s2 + ", " + s3 + ", " + t + ")" + ) + } + } + + /** + * Holds if there exists a transition from `r` to `q` in the product automaton. + * Notice that the arguments are flipped, and thus the direction is backwards. + */ + pragma[noinline] + predicate tupleDeltaBackwards(StateTuple q, StateTuple r) { step(r, _, _, _, q) } + + /** + * Holds if `tuple` is an end state in our search. + * That means there exists a pair of loops `(pivot, succ)` such that `tuple = (pivot, succ, succ)`. + */ + predicate isEndTuple(StateTuple tuple) { tuple = getAnEndTuple(_, _) } + + /** + * Gets the minimum length of a path from `r` to some an end state `end`. 
+ * + * The implementation searches backwards from the end-tuple. + * This approach was chosen because it is way more efficient if the first predicate given to `shortestDistances` is small. + * The `end` argument must always be an end state. + */ + int distBackFromEnd(StateTuple r, StateTuple end) = + shortestDistances(isEndTuple/1, tupleDeltaBackwards/2)(end, r, result) + + /** + * Holds if there exists a pair of repetitions `(pivot, succ)` in the regular expression such that: + * `tuple` is reachable from `(pivot, pivot, succ)` in the product automaton, + * and there is a distance of `dist` from `tuple` to the nearest end-tuple `(pivot, succ, succ)`, + * and a path from a start-state to `tuple` follows the transitions in `trace`. + */ + private predicate isReachableFromStartTuple( + State pivot, State succ, StateTuple tuple, Trace trace, int dist + ) { + exists(InputSymbol s1, InputSymbol s2, InputSymbol s3, Trace v | + isReachableFromStartTuple(pivot, succ, v, s1, s2, s3, tuple, dist) and + trace = Step(s1, s2, s3, v) + ) + } + + private predicate isReachableFromStartTuple( + State pivot, State succ, Trace trace, InputSymbol s1, InputSymbol s2, InputSymbol s3, + StateTuple tuple, int dist + ) { + // base case. + exists(State q1, State q2, State q3 | + isStartLoops(pivot, succ) and + step(MkStateTuple(pivot, pivot, succ), s1, s2, s3, tuple) and + tuple = MkStateTuple(q1, q2, q3) and + trace = Nil() and + dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ)) + ) + or + // recursive case + exists(StateTuple p | + isReachableFromStartTuple(pivot, succ, p, trace, dist + 1) and + dist = distBackFromEnd(tuple, MkStateTuple(pivot, succ, succ)) and + step(p, s1, s2, s3, tuple) + ) + } + + /** + * Gets the tuple `(pivot, succ, succ)` from the product automaton. 
+ */ + StateTuple getAnEndTuple(State pivot, State succ) { + isStartLoops(pivot, succ) and + result = MkStateTuple(pivot, succ, succ) + } + + /** An implementation of a chain containing chars for use by `Concretizer`. */ + private module CharTreeImpl implements CharTree { + class CharNode = Trace; + + CharNode getPrev(CharNode t) { t = Step(_, _, _, result) } + + /** Holds if `n` is used in `isPumpable`. */ + predicate isARelevantEnd(CharNode n) { + exists(State pivot, State succ | + isReachableFromStartTuple(pivot, succ, getAnEndTuple(pivot, succ), n, _) + ) + } + + string getChar(CharNode t) { + exists(InputSymbol s1, InputSymbol s2, InputSymbol s3 | t = Step(s1, s2, s3, _) | + result = getAThreewayIntersect(s1, s2, s3) + ) + } + } + + /** + * Holds if matching repetitions of `pump` can: + * 1) Transition from `pivot` back to `pivot`. + * 2) Transition from `pivot` to `succ`. + * 3) Transition from `succ` to `succ`. + * + * From theorem 3 in the paper linked in the top of this file we can therefore conclude that + * the regular expression has polynomial backtracking - if a rejecting suffix exists. + * + * This predicate is used by `SuperLinearReDoSConfiguration`, and the final results are + * available in the `hasReDoSResult` predicate. + */ + predicate isPumpable(State pivot, State succ, string pump) { + exists(StateTuple q, Trace t | + isReachableFromStartTuple(pivot, succ, q, t, _) and + q = getAnEndTuple(pivot, succ) and + pump = Concretizer ::concretize(t) + ) + } + + /** + * Holds if states starting in `state` can have polynomial backtracking with the string `pump`. + */ + predicate isReDoSCandidate(State state, string pump) { isPumpable(_, state, pump) } + + /** + * Holds if repetitions of `pump` at `t` will cause polynomial backtracking. 
+ */ + predicate polynomialReDoS(RegExpTerm t, string pump, string prefixMsg, RegExpTerm prev) { + exists(State s, State pivot | + ReDoSPruning ::hasReDoSResult(t, pump, s, prefixMsg) and + isPumpable(pivot, s, _) and + prev = pivot.getRepr() + ) + } + + /** + * Gets a message for why `term` can cause polynomial backtracking. + */ + string getReasonString(RegExpTerm term, string pump, string prefixMsg, RegExpTerm prev) { + polynomialReDoS(term, pump, prefixMsg, prev) and + result = + "Strings " + prefixMsg + "with many repetitions of '" + pump + + "' can start matching anywhere after the start of the preceeding " + prev + } + + /** + * A term that may cause a regular expression engine to perform a + * polynomial number of match attempts, relative to the input length. + */ + class PolynomialBackTrackingTerm instanceof InfiniteRepetitionQuantifier { + string reason; + string pump; + string prefixMsg; + RegExpTerm prev; + + PolynomialBackTrackingTerm() { + reason = getReasonString(this, pump, prefixMsg, prev) and + // there might be many reasons for this term to have polynomial backtracking - we pick the shortest one. + reason = + min(string msg | msg = getReasonString(this, _, _, _) | msg order by msg.length(), msg) + } + + /** + * Holds if all non-empty successors to the polynomial backtracking term matches the end of the line. + */ + predicate isAtEndLine() { + forall(RegExpTerm succ | super.getSuccessor+() = succ and not matchesEpsilon(succ) | + succ instanceof RegExpDollar + ) + } + + /** + * Gets the string that should be repeated to cause this regular expression to perform polynomially. + */ + string getPumpString() { result = pump } + + /** + * Gets a message for which prefix a matching string must start with for this term to cause polynomial backtracking. + */ + string getPrefixMessage() { result = prefixMsg } + + /** + * Gets a predecessor to `this`, which also loops on the pump string, and thereby causes polynomial backtracking. 
+ */ + RegExpTerm getPreviousLoop() { result = prev } + + /** + * Gets the reason for the number of match attempts. + */ + string getReason() { result = reason } + + /** Gets a string representation of this term. */ + string toString() { result = super.toString() } + + /** Gets the outermost term of this regular expression. */ + RegExpTerm getRootTerm() { result = super.getRootTerm() } + + /** Holds if this term has the specific location. */ + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ) { + super.hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn) + } + } +} diff --git a/shared/regex/qlpack.yml b/shared/regex/qlpack.yml new file mode 100644 index 00000000000..4b25672b6c5 --- /dev/null +++ b/shared/regex/qlpack.yml @@ -0,0 +1,5 @@ +name: codeql/regex +version: 0.0.1-dev +groups: shared +library: true +dependencies: diff --git a/shared/ssa/codeql/ssa/Ssa.qll b/shared/ssa/codeql/ssa/Ssa.qll index 886e4128e26..19f31f7c8bb 100644 --- a/shared/ssa/codeql/ssa/Ssa.qll +++ b/shared/ssa/codeql/ssa/Ssa.qll @@ -9,7 +9,10 @@ signature module InputSig { * A basic block, that is, a maximal straight-line sequence of control flow nodes * without branches or joins. */ - class BasicBlock; + class BasicBlock { + /** Gets a textual representation of this basic block. */ + string toString(); + } /** * Gets the basic block that immediately dominates basic block `bb`, if any. @@ -43,7 +46,10 @@ signature module InputSig { class ExitBasicBlock extends BasicBlock; /** A variable that can be SSA converted. */ - class SourceVariable; + class SourceVariable { + /** Gets a textual representation of this variable. */ + string toString(); + } /** * Holds if the `i`th node of basic block `bb` is a (potential) write to source @@ -846,8 +852,6 @@ module Make { } /** Provides a set of consistency queries. 
*/ - // TODO: Make these `query` predicates once class signatures are supported - // (`SourceVariable` and `BasicBlock` must have `toString`) module Consistency { /** A definition that is relevant for the consistency queries. */ abstract class RelevantDefinition extends Definition { @@ -858,19 +862,19 @@ module Make { } /** Holds if a read can be reached from multiple definitions. */ - predicate nonUniqueDef(RelevantDefinition def, SourceVariable v, BasicBlock bb, int i) { + query predicate nonUniqueDef(RelevantDefinition def, SourceVariable v, BasicBlock bb, int i) { ssaDefReachesRead(v, def, bb, i) and not exists(unique(Definition def0 | ssaDefReachesRead(v, def0, bb, i))) } /** Holds if a read cannot be reached from a definition. */ - predicate readWithoutDef(SourceVariable v, BasicBlock bb, int i) { + query predicate readWithoutDef(SourceVariable v, BasicBlock bb, int i) { variableRead(bb, i, v, _) and not ssaDefReachesRead(v, _, bb, i) } /** Holds if a definition cannot reach a read. */ - predicate deadDef(RelevantDefinition def, SourceVariable v) { + query predicate deadDef(RelevantDefinition def, SourceVariable v) { v = def.getSourceVariable() and not ssaDefReachesRead(_, def, _, _) and not phiHasInputFromBlock(_, def, _) and @@ -878,7 +882,7 @@ module Make { } /** Holds if a read is not dominated by a definition. 
*/ - predicate notDominatedByDef(RelevantDefinition def, SourceVariable v, BasicBlock bb, int i) { + query predicate notDominatedByDef(RelevantDefinition def, SourceVariable v, BasicBlock bb, int i) { exists(BasicBlock bbDef, int iDef | def.definesAt(v, bbDef, iDef) | ssaDefReachesReadWithinBlock(v, def, bb, i) and (bb != bbDef or i < iDef) diff --git a/swift/actions/database-upgrade-scripts/action.yml b/swift/actions/database-upgrade-scripts/action.yml new file mode 100644 index 00000000000..26f95d44b8a --- /dev/null +++ b/swift/actions/database-upgrade-scripts/action.yml @@ -0,0 +1,23 @@ +name: Check Swift database upgrade/downgrade scripts +runs: + using: composite + steps: + - name: Check upgrade scripts + shell: bash + working-directory: swift + run: | + echo > empty.trap + codeql dataset import -S ql/lib/upgrades/initial/swift.dbscheme testdb empty.trap + codeql dataset upgrade testdb --additional-packs ql/lib + diff -q testdb/swift.dbscheme ql/lib/swift.dbscheme + - name: Check downgrade scripts + shell: bash + working-directory: swift + run: | + echo > empty.trap + rm -rf testdb + codeql dataset import -S ql/lib/swift.dbscheme testdb empty.trap + codeql resolve upgrades --format=lines --allow-downgrades --additional-packs downgrades \ + --dbscheme=ql/lib/swift.dbscheme --target-dbscheme=downgrades/initial/swift.dbscheme | + xargs -r codeql execute upgrades testdb + diff -q testdb/swift.dbscheme downgrades/initial/swift.dbscheme diff --git a/swift/codegen/generators/qlgen.py b/swift/codegen/generators/qlgen.py index 2172a9517e6..662a782dbd7 100755 --- a/swift/codegen/generators/qlgen.py +++ b/swift/codegen/generators/qlgen.py @@ -63,6 +63,7 @@ abbreviations = { "repr": "representation", "param": "parameter", "int": "integer", + "var": "variable", } abbreviations.update({f"{k}s": f"{v}s" for k, v in abbreviations.items()}) diff --git a/swift/downgrades/initial/swift.dbscheme b/swift/downgrades/initial/swift.dbscheme new file mode 100644 index 
00000000000..ceca289a0ff --- /dev/null +++ b/swift/downgrades/initial/swift.dbscheme @@ -0,0 +1,2493 @@ +// generated by codegen/codegen.py + +// from prefix.dbscheme +/** + * The source location of the snapshot. + */ +sourceLocationPrefix( + string prefix: string ref +); + + +// from schema.py + +@element = + @callable +| @file +| @generic_context +| @iterable_decl_context +| @locatable +| @location +| @type +; + +#keyset[id] +element_is_unknown( + int id: @element ref +); + +@callable = + @abstract_closure_expr +| @abstract_function_decl +; + +#keyset[id] +callable_self_params( + int id: @callable ref, + int self_param: @param_decl_or_none ref +); + +#keyset[id, index] +callable_params( + int id: @callable ref, + int index: int ref, + int param: @param_decl_or_none ref +); + +#keyset[id] +callable_bodies( + int id: @callable ref, + int body: @brace_stmt_or_none ref +); + +@file = + @db_file +| @unknown_file +; + +#keyset[id] +files( + int id: @file ref, + string name: string ref +); + +@locatable = + @argument +| @ast_node +| @comment +| @diagnostics +| @error_element +; + +#keyset[id] +locatable_locations( + int id: @locatable ref, + int location: @location_or_none ref +); + +@location = + @db_location +| @unknown_location +; + +#keyset[id] +locations( + int id: @location ref, + int file: @file_or_none ref, + int start_line: int ref, + int start_column: int ref, + int end_line: int ref, + int end_column: int ref +); + +@ast_node = + @case_label_item +| @condition_element +| @decl +| @expr +| @pattern +| @stmt +| @stmt_condition +| @type_repr +; + +comments( + unique int id: @comment, + string text: string ref +); + +db_files( + unique int id: @db_file +); + +db_locations( + unique int id: @db_location +); + +diagnostics( + unique int id: @diagnostics, + string text: string ref, + int kind: int ref +); + +@error_element = + @error_expr +| @error_type +| @overloaded_decl_ref_expr +| @unresolved_decl_ref_expr +| @unresolved_dot_expr +| 
@unresolved_member_chain_result_expr +| @unresolved_member_expr +| @unresolved_pattern_expr +| @unresolved_specialize_expr +| @unresolved_type +| @unresolved_type_conversion_expr +| @unspecified_element +; + +unknown_files( + unique int id: @unknown_file +); + +unknown_locations( + unique int id: @unknown_location +); + +unspecified_elements( + unique int id: @unspecified_element, + string property: string ref, + string error: string ref +); + +#keyset[id] +unspecified_element_parents( + int id: @unspecified_element ref, + int parent: @element ref +); + +#keyset[id] +unspecified_element_indices( + int id: @unspecified_element ref, + int index: int ref +); + +@decl = + @enum_case_decl +| @extension_decl +| @if_config_decl +| @import_decl +| @missing_member_decl +| @operator_decl +| @pattern_binding_decl +| @pound_diagnostic_decl +| @precedence_group_decl +| @top_level_code_decl +| @value_decl +; + +#keyset[id] +decls( //dir=decl + int id: @decl ref, + int module: @module_decl_or_none ref +); + +@generic_context = + @abstract_function_decl +| @extension_decl +| @generic_type_decl +| @subscript_decl +; + +#keyset[id, index] +generic_context_generic_type_params( //dir=decl + int id: @generic_context ref, + int index: int ref, + int generic_type_param: @generic_type_param_decl_or_none ref +); + +@iterable_decl_context = + @extension_decl +| @nominal_type_decl +; + +#keyset[id, index] +iterable_decl_context_members( //dir=decl + int id: @iterable_decl_context ref, + int index: int ref, + int member: @decl_or_none ref +); + +enum_case_decls( //dir=decl + unique int id: @enum_case_decl +); + +#keyset[id, index] +enum_case_decl_elements( //dir=decl + int id: @enum_case_decl ref, + int index: int ref, + int element: @enum_element_decl_or_none ref +); + +extension_decls( //dir=decl + unique int id: @extension_decl, + int extended_type_decl: @nominal_type_decl_or_none ref +); + +if_config_decls( //dir=decl + unique int id: @if_config_decl +); + +#keyset[id, index] 
+if_config_decl_active_elements( //dir=decl + int id: @if_config_decl ref, + int index: int ref, + int active_element: @ast_node_or_none ref +); + +import_decls( //dir=decl + unique int id: @import_decl +); + +#keyset[id] +import_decl_is_exported( //dir=decl + int id: @import_decl ref +); + +#keyset[id] +import_decl_imported_modules( //dir=decl + int id: @import_decl ref, + int imported_module: @module_decl_or_none ref +); + +#keyset[id, index] +import_decl_declarations( //dir=decl + int id: @import_decl ref, + int index: int ref, + int declaration: @value_decl_or_none ref +); + +missing_member_decls( //dir=decl + unique int id: @missing_member_decl, + string name: string ref +); + +@operator_decl = + @infix_operator_decl +| @postfix_operator_decl +| @prefix_operator_decl +; + +#keyset[id] +operator_decls( //dir=decl + int id: @operator_decl ref, + string name: string ref +); + +pattern_binding_decls( //dir=decl + unique int id: @pattern_binding_decl +); + +#keyset[id, index] +pattern_binding_decl_inits( //dir=decl + int id: @pattern_binding_decl ref, + int index: int ref, + int init: @expr_or_none ref +); + +#keyset[id, index] +pattern_binding_decl_patterns( //dir=decl + int id: @pattern_binding_decl ref, + int index: int ref, + int pattern: @pattern_or_none ref +); + +pound_diagnostic_decls( //dir=decl + unique int id: @pound_diagnostic_decl, + int kind: int ref, + int message: @string_literal_expr_or_none ref +); + +precedence_group_decls( //dir=decl + unique int id: @precedence_group_decl +); + +top_level_code_decls( //dir=decl + unique int id: @top_level_code_decl, + int body: @brace_stmt_or_none ref +); + +@value_decl = + @abstract_function_decl +| @abstract_storage_decl +| @enum_element_decl +| @type_decl +; + +#keyset[id] +value_decls( //dir=decl + int id: @value_decl ref, + int interface_type: @type_or_none ref +); + +@abstract_function_decl = + @constructor_decl +| @destructor_decl +| @func_decl +; + +#keyset[id] +abstract_function_decls( //dir=decl + int 
id: @abstract_function_decl ref, + string name: string ref +); + +@abstract_storage_decl = + @subscript_decl +| @var_decl +; + +#keyset[id, index] +abstract_storage_decl_accessor_decls( //dir=decl + int id: @abstract_storage_decl ref, + int index: int ref, + int accessor_decl: @accessor_decl_or_none ref +); + +enum_element_decls( //dir=decl + unique int id: @enum_element_decl, + string name: string ref +); + +#keyset[id, index] +enum_element_decl_params( //dir=decl + int id: @enum_element_decl ref, + int index: int ref, + int param: @param_decl_or_none ref +); + +infix_operator_decls( //dir=decl + unique int id: @infix_operator_decl +); + +#keyset[id] +infix_operator_decl_precedence_groups( //dir=decl + int id: @infix_operator_decl ref, + int precedence_group: @precedence_group_decl_or_none ref +); + +postfix_operator_decls( //dir=decl + unique int id: @postfix_operator_decl +); + +prefix_operator_decls( //dir=decl + unique int id: @prefix_operator_decl +); + +@type_decl = + @abstract_type_param_decl +| @generic_type_decl +| @module_decl +; + +#keyset[id] +type_decls( //dir=decl + int id: @type_decl ref, + string name: string ref +); + +#keyset[id, index] +type_decl_base_types( //dir=decl + int id: @type_decl ref, + int index: int ref, + int base_type: @type_or_none ref +); + +@abstract_type_param_decl = + @associated_type_decl +| @generic_type_param_decl +; + +constructor_decls( //dir=decl + unique int id: @constructor_decl +); + +destructor_decls( //dir=decl + unique int id: @destructor_decl +); + +@func_decl = + @accessor_decl +| @concrete_func_decl +; + +@generic_type_decl = + @nominal_type_decl +| @opaque_type_decl +| @type_alias_decl +; + +module_decls( //dir=decl + unique int id: @module_decl +); + +#keyset[id] +module_decl_is_builtin_module( //dir=decl + int id: @module_decl ref +); + +#keyset[id] +module_decl_is_system_module( //dir=decl + int id: @module_decl ref +); + +#keyset[id, index] +module_decl_imported_modules( //dir=decl + int id: @module_decl 
ref, + int index: int ref, + int imported_module: @module_decl_or_none ref +); + +#keyset[id, index] +module_decl_exported_modules( //dir=decl + int id: @module_decl ref, + int index: int ref, + int exported_module: @module_decl_or_none ref +); + +subscript_decls( //dir=decl + unique int id: @subscript_decl, + int element_type: @type_or_none ref +); + +#keyset[id, index] +subscript_decl_params( //dir=decl + int id: @subscript_decl ref, + int index: int ref, + int param: @param_decl_or_none ref +); + +@var_decl = + @concrete_var_decl +| @param_decl +; + +#keyset[id] +var_decls( //dir=decl + int id: @var_decl ref, + string name: string ref, + int type_: @type_or_none ref +); + +#keyset[id] +var_decl_attached_property_wrapper_types( //dir=decl + int id: @var_decl ref, + int attached_property_wrapper_type: @type_or_none ref +); + +#keyset[id] +var_decl_parent_patterns( //dir=decl + int id: @var_decl ref, + int parent_pattern: @pattern_or_none ref +); + +#keyset[id] +var_decl_parent_initializers( //dir=decl + int id: @var_decl ref, + int parent_initializer: @expr_or_none ref +); + +#keyset[id] +var_decl_property_wrapper_backing_var_bindings( //dir=decl + int id: @var_decl ref, + int property_wrapper_backing_var_binding: @pattern_binding_decl_or_none ref +); + +#keyset[id] +var_decl_property_wrapper_backing_vars( //dir=decl + int id: @var_decl ref, + int property_wrapper_backing_var: @var_decl_or_none ref +); + +#keyset[id] +var_decl_property_wrapper_projection_var_bindings( //dir=decl + int id: @var_decl ref, + int property_wrapper_projection_var_binding: @pattern_binding_decl_or_none ref +); + +#keyset[id] +var_decl_property_wrapper_projection_vars( //dir=decl + int id: @var_decl ref, + int property_wrapper_projection_var: @var_decl_or_none ref +); + +accessor_decls( //dir=decl + unique int id: @accessor_decl +); + +#keyset[id] +accessor_decl_is_getter( //dir=decl + int id: @accessor_decl ref +); + +#keyset[id] +accessor_decl_is_setter( //dir=decl + int id: 
@accessor_decl ref +); + +#keyset[id] +accessor_decl_is_will_set( //dir=decl + int id: @accessor_decl ref +); + +#keyset[id] +accessor_decl_is_did_set( //dir=decl + int id: @accessor_decl ref +); + +associated_type_decls( //dir=decl + unique int id: @associated_type_decl +); + +concrete_func_decls( //dir=decl + unique int id: @concrete_func_decl +); + +concrete_var_decls( //dir=decl + unique int id: @concrete_var_decl, + int introducer_int: int ref +); + +generic_type_param_decls( //dir=decl + unique int id: @generic_type_param_decl +); + +@nominal_type_decl = + @class_decl +| @enum_decl +| @protocol_decl +| @struct_decl +; + +#keyset[id] +nominal_type_decls( //dir=decl + int id: @nominal_type_decl ref, + int type_: @type_or_none ref +); + +opaque_type_decls( //dir=decl + unique int id: @opaque_type_decl, + int naming_declaration: @value_decl_or_none ref +); + +#keyset[id, index] +opaque_type_decl_opaque_generic_params( //dir=decl + int id: @opaque_type_decl ref, + int index: int ref, + int opaque_generic_param: @generic_type_param_type_or_none ref +); + +param_decls( //dir=decl + unique int id: @param_decl +); + +#keyset[id] +param_decl_is_inout( //dir=decl + int id: @param_decl ref +); + +#keyset[id] +param_decl_property_wrapper_local_wrapped_var_bindings( //dir=decl + int id: @param_decl ref, + int property_wrapper_local_wrapped_var_binding: @pattern_binding_decl_or_none ref +); + +#keyset[id] +param_decl_property_wrapper_local_wrapped_vars( //dir=decl + int id: @param_decl ref, + int property_wrapper_local_wrapped_var: @var_decl_or_none ref +); + +type_alias_decls( //dir=decl + unique int id: @type_alias_decl +); + +class_decls( //dir=decl + unique int id: @class_decl +); + +enum_decls( //dir=decl + unique int id: @enum_decl +); + +protocol_decls( //dir=decl + unique int id: @protocol_decl +); + +struct_decls( //dir=decl + unique int id: @struct_decl +); + +arguments( //dir=expr + unique int id: @argument, + string label: string ref, + int expr: @expr_or_none 
ref +); + +@expr = + @abstract_closure_expr +| @any_try_expr +| @applied_property_wrapper_expr +| @apply_expr +| @assign_expr +| @bind_optional_expr +| @capture_list_expr +| @collection_expr +| @decl_ref_expr +| @default_argument_expr +| @discard_assignment_expr +| @dot_syntax_base_ignored_expr +| @dynamic_type_expr +| @enum_is_case_expr +| @error_expr +| @explicit_cast_expr +| @force_value_expr +| @identity_expr +| @if_expr +| @implicit_conversion_expr +| @in_out_expr +| @key_path_application_expr +| @key_path_dot_expr +| @key_path_expr +| @lazy_initializer_expr +| @literal_expr +| @lookup_expr +| @make_temporarily_escapable_expr +| @obj_c_selector_expr +| @one_way_expr +| @opaque_value_expr +| @open_existential_expr +| @optional_evaluation_expr +| @other_constructor_decl_ref_expr +| @overloaded_decl_ref_expr +| @property_wrapper_value_placeholder_expr +| @rebind_self_in_constructor_expr +| @sequence_expr +| @super_ref_expr +| @tap_expr +| @tuple_element_expr +| @tuple_expr +| @type_expr +| @unresolved_decl_ref_expr +| @unresolved_dot_expr +| @unresolved_member_expr +| @unresolved_pattern_expr +| @unresolved_specialize_expr +| @vararg_expansion_expr +; + +#keyset[id] +expr_types( //dir=expr + int id: @expr ref, + int type_: @type_or_none ref +); + +@abstract_closure_expr = + @auto_closure_expr +| @closure_expr +; + +@any_try_expr = + @force_try_expr +| @optional_try_expr +| @try_expr +; + +#keyset[id] +any_try_exprs( //dir=expr + int id: @any_try_expr ref, + int sub_expr: @expr_or_none ref +); + +applied_property_wrapper_exprs( //dir=expr + unique int id: @applied_property_wrapper_expr, + int kind: int ref, + int value: @expr_or_none ref, + int param: @param_decl_or_none ref +); + +@apply_expr = + @binary_expr +| @call_expr +| @postfix_unary_expr +| @prefix_unary_expr +| @self_apply_expr +; + +#keyset[id] +apply_exprs( //dir=expr + int id: @apply_expr ref, + int function: @expr_or_none ref +); + +#keyset[id, index] +apply_expr_arguments( //dir=expr + int id: 
@apply_expr ref, + int index: int ref, + int argument: @argument_or_none ref +); + +assign_exprs( //dir=expr + unique int id: @assign_expr, + int dest: @expr_or_none ref, + int source: @expr_or_none ref +); + +bind_optional_exprs( //dir=expr + unique int id: @bind_optional_expr, + int sub_expr: @expr_or_none ref +); + +capture_list_exprs( //dir=expr + unique int id: @capture_list_expr, + int closure_body: @closure_expr_or_none ref +); + +#keyset[id, index] +capture_list_expr_binding_decls( //dir=expr + int id: @capture_list_expr ref, + int index: int ref, + int binding_decl: @pattern_binding_decl_or_none ref +); + +@collection_expr = + @array_expr +| @dictionary_expr +; + +decl_ref_exprs( //dir=expr + unique int id: @decl_ref_expr, + int decl: @decl_or_none ref +); + +#keyset[id, index] +decl_ref_expr_replacement_types( //dir=expr + int id: @decl_ref_expr ref, + int index: int ref, + int replacement_type: @type_or_none ref +); + +#keyset[id] +decl_ref_expr_has_direct_to_storage_semantics( //dir=expr + int id: @decl_ref_expr ref +); + +#keyset[id] +decl_ref_expr_has_direct_to_implementation_semantics( //dir=expr + int id: @decl_ref_expr ref +); + +#keyset[id] +decl_ref_expr_has_ordinary_semantics( //dir=expr + int id: @decl_ref_expr ref +); + +default_argument_exprs( //dir=expr + unique int id: @default_argument_expr, + int param_decl: @param_decl_or_none ref, + int param_index: int ref +); + +#keyset[id] +default_argument_expr_caller_side_defaults( //dir=expr + int id: @default_argument_expr ref, + int caller_side_default: @expr_or_none ref +); + +discard_assignment_exprs( //dir=expr + unique int id: @discard_assignment_expr +); + +dot_syntax_base_ignored_exprs( //dir=expr + unique int id: @dot_syntax_base_ignored_expr, + int qualifier: @expr_or_none ref, + int sub_expr: @expr_or_none ref +); + +dynamic_type_exprs( //dir=expr + unique int id: @dynamic_type_expr, + int base: @expr_or_none ref +); + +enum_is_case_exprs( //dir=expr + unique int id: @enum_is_case_expr, 
+ int sub_expr: @expr_or_none ref, + int element: @enum_element_decl_or_none ref +); + +error_exprs( //dir=expr + unique int id: @error_expr +); + +@explicit_cast_expr = + @checked_cast_expr +| @coerce_expr +; + +#keyset[id] +explicit_cast_exprs( //dir=expr + int id: @explicit_cast_expr ref, + int sub_expr: @expr_or_none ref +); + +force_value_exprs( //dir=expr + unique int id: @force_value_expr, + int sub_expr: @expr_or_none ref +); + +@identity_expr = + @await_expr +| @dot_self_expr +| @paren_expr +| @unresolved_member_chain_result_expr +; + +#keyset[id] +identity_exprs( //dir=expr + int id: @identity_expr ref, + int sub_expr: @expr_or_none ref +); + +if_exprs( //dir=expr + unique int id: @if_expr, + int condition: @expr_or_none ref, + int then_expr: @expr_or_none ref, + int else_expr: @expr_or_none ref +); + +@implicit_conversion_expr = + @any_hashable_erasure_expr +| @archetype_to_super_expr +| @array_to_pointer_expr +| @bridge_from_obj_c_expr +| @bridge_to_obj_c_expr +| @class_metatype_to_object_expr +| @collection_upcast_conversion_expr +| @conditional_bridge_from_obj_c_expr +| @covariant_function_conversion_expr +| @covariant_return_conversion_expr +| @derived_to_base_expr +| @destructure_tuple_expr +| @differentiable_function_expr +| @differentiable_function_extract_original_expr +| @erasure_expr +| @existential_metatype_to_object_expr +| @foreign_object_conversion_expr +| @function_conversion_expr +| @in_out_to_pointer_expr +| @inject_into_optional_expr +| @linear_function_expr +| @linear_function_extract_original_expr +| @linear_to_differentiable_function_expr +| @load_expr +| @metatype_conversion_expr +| @pointer_to_pointer_expr +| @protocol_metatype_to_object_expr +| @string_to_pointer_expr +| @underlying_to_opaque_expr +| @unevaluated_instance_expr +| @unresolved_type_conversion_expr +; + +#keyset[id] +implicit_conversion_exprs( //dir=expr + int id: @implicit_conversion_expr ref, + int sub_expr: @expr_or_none ref +); + +in_out_exprs( //dir=expr + 
unique int id: @in_out_expr, + int sub_expr: @expr_or_none ref +); + +key_path_application_exprs( //dir=expr + unique int id: @key_path_application_expr, + int base: @expr_or_none ref, + int key_path: @expr_or_none ref +); + +key_path_dot_exprs( //dir=expr + unique int id: @key_path_dot_expr +); + +key_path_exprs( //dir=expr + unique int id: @key_path_expr +); + +#keyset[id] +key_path_expr_roots( //dir=expr + int id: @key_path_expr ref, + int root: @type_repr_or_none ref +); + +#keyset[id] +key_path_expr_parsed_paths( //dir=expr + int id: @key_path_expr ref, + int parsed_path: @expr_or_none ref +); + +lazy_initializer_exprs( //dir=expr + unique int id: @lazy_initializer_expr, + int sub_expr: @expr_or_none ref +); + +@literal_expr = + @builtin_literal_expr +| @interpolated_string_literal_expr +| @nil_literal_expr +| @object_literal_expr +| @regex_literal_expr +; + +@lookup_expr = + @dynamic_lookup_expr +| @member_ref_expr +| @method_ref_expr +| @subscript_expr +; + +#keyset[id] +lookup_exprs( //dir=expr + int id: @lookup_expr ref, + int base: @expr_or_none ref +); + +#keyset[id] +lookup_expr_members( //dir=expr + int id: @lookup_expr ref, + int member: @decl_or_none ref +); + +make_temporarily_escapable_exprs( //dir=expr + unique int id: @make_temporarily_escapable_expr, + int escaping_closure: @opaque_value_expr_or_none ref, + int nonescaping_closure: @expr_or_none ref, + int sub_expr: @expr_or_none ref +); + +obj_c_selector_exprs( //dir=expr + unique int id: @obj_c_selector_expr, + int sub_expr: @expr_or_none ref, + int method: @abstract_function_decl_or_none ref +); + +one_way_exprs( //dir=expr + unique int id: @one_way_expr, + int sub_expr: @expr_or_none ref +); + +opaque_value_exprs( //dir=expr + unique int id: @opaque_value_expr +); + +open_existential_exprs( //dir=expr + unique int id: @open_existential_expr, + int sub_expr: @expr_or_none ref, + int existential: @expr_or_none ref, + int opaque_expr: @opaque_value_expr_or_none ref +); + 
+optional_evaluation_exprs( //dir=expr + unique int id: @optional_evaluation_expr, + int sub_expr: @expr_or_none ref +); + +other_constructor_decl_ref_exprs( //dir=expr + unique int id: @other_constructor_decl_ref_expr, + int constructor_decl: @constructor_decl_or_none ref +); + +overloaded_decl_ref_exprs( //dir=expr + unique int id: @overloaded_decl_ref_expr +); + +#keyset[id, index] +overloaded_decl_ref_expr_possible_declarations( //dir=expr + int id: @overloaded_decl_ref_expr ref, + int index: int ref, + int possible_declaration: @value_decl_or_none ref +); + +property_wrapper_value_placeholder_exprs( //dir=expr + unique int id: @property_wrapper_value_placeholder_expr, + int placeholder: @opaque_value_expr_or_none ref +); + +#keyset[id] +property_wrapper_value_placeholder_expr_wrapped_values( //dir=expr + int id: @property_wrapper_value_placeholder_expr ref, + int wrapped_value: @expr_or_none ref +); + +rebind_self_in_constructor_exprs( //dir=expr + unique int id: @rebind_self_in_constructor_expr, + int sub_expr: @expr_or_none ref, + int self: @var_decl_or_none ref +); + +sequence_exprs( //dir=expr + unique int id: @sequence_expr +); + +#keyset[id, index] +sequence_expr_elements( //dir=expr + int id: @sequence_expr ref, + int index: int ref, + int element: @expr_or_none ref +); + +super_ref_exprs( //dir=expr + unique int id: @super_ref_expr, + int self: @var_decl_or_none ref +); + +tap_exprs( //dir=expr + unique int id: @tap_expr, + int body: @brace_stmt_or_none ref, + int var: @var_decl_or_none ref +); + +#keyset[id] +tap_expr_sub_exprs( //dir=expr + int id: @tap_expr ref, + int sub_expr: @expr_or_none ref +); + +tuple_element_exprs( //dir=expr + unique int id: @tuple_element_expr, + int sub_expr: @expr_or_none ref, + int index: int ref +); + +tuple_exprs( //dir=expr + unique int id: @tuple_expr +); + +#keyset[id, index] +tuple_expr_elements( //dir=expr + int id: @tuple_expr ref, + int index: int ref, + int element: @expr_or_none ref +); + +type_exprs( 
//dir=expr + unique int id: @type_expr +); + +#keyset[id] +type_expr_type_reprs( //dir=expr + int id: @type_expr ref, + int type_repr: @type_repr_or_none ref +); + +unresolved_decl_ref_exprs( //dir=expr + unique int id: @unresolved_decl_ref_expr +); + +#keyset[id] +unresolved_decl_ref_expr_names( //dir=expr + int id: @unresolved_decl_ref_expr ref, + string name: string ref +); + +unresolved_dot_exprs( //dir=expr + unique int id: @unresolved_dot_expr, + int base: @expr_or_none ref, + string name: string ref +); + +unresolved_member_exprs( //dir=expr + unique int id: @unresolved_member_expr, + string name: string ref +); + +unresolved_pattern_exprs( //dir=expr + unique int id: @unresolved_pattern_expr, + int sub_pattern: @pattern_or_none ref +); + +unresolved_specialize_exprs( //dir=expr + unique int id: @unresolved_specialize_expr, + int sub_expr: @expr_or_none ref +); + +vararg_expansion_exprs( //dir=expr + unique int id: @vararg_expansion_expr, + int sub_expr: @expr_or_none ref +); + +any_hashable_erasure_exprs( //dir=expr + unique int id: @any_hashable_erasure_expr +); + +archetype_to_super_exprs( //dir=expr + unique int id: @archetype_to_super_expr +); + +array_exprs( //dir=expr + unique int id: @array_expr +); + +#keyset[id, index] +array_expr_elements( //dir=expr + int id: @array_expr ref, + int index: int ref, + int element: @expr_or_none ref +); + +array_to_pointer_exprs( //dir=expr + unique int id: @array_to_pointer_expr +); + +auto_closure_exprs( //dir=expr + unique int id: @auto_closure_expr +); + +await_exprs( //dir=expr + unique int id: @await_expr +); + +binary_exprs( //dir=expr + unique int id: @binary_expr +); + +bridge_from_obj_c_exprs( //dir=expr + unique int id: @bridge_from_obj_c_expr +); + +bridge_to_obj_c_exprs( //dir=expr + unique int id: @bridge_to_obj_c_expr +); + +@builtin_literal_expr = + @boolean_literal_expr +| @magic_identifier_literal_expr +| @number_literal_expr +| @string_literal_expr +; + +call_exprs( //dir=expr + unique int id: 
@call_expr +); + +@checked_cast_expr = + @conditional_checked_cast_expr +| @forced_checked_cast_expr +| @is_expr +; + +class_metatype_to_object_exprs( //dir=expr + unique int id: @class_metatype_to_object_expr +); + +closure_exprs( //dir=expr + unique int id: @closure_expr +); + +coerce_exprs( //dir=expr + unique int id: @coerce_expr +); + +collection_upcast_conversion_exprs( //dir=expr + unique int id: @collection_upcast_conversion_expr +); + +conditional_bridge_from_obj_c_exprs( //dir=expr + unique int id: @conditional_bridge_from_obj_c_expr +); + +covariant_function_conversion_exprs( //dir=expr + unique int id: @covariant_function_conversion_expr +); + +covariant_return_conversion_exprs( //dir=expr + unique int id: @covariant_return_conversion_expr +); + +derived_to_base_exprs( //dir=expr + unique int id: @derived_to_base_expr +); + +destructure_tuple_exprs( //dir=expr + unique int id: @destructure_tuple_expr +); + +dictionary_exprs( //dir=expr + unique int id: @dictionary_expr +); + +#keyset[id, index] +dictionary_expr_elements( //dir=expr + int id: @dictionary_expr ref, + int index: int ref, + int element: @expr_or_none ref +); + +differentiable_function_exprs( //dir=expr + unique int id: @differentiable_function_expr +); + +differentiable_function_extract_original_exprs( //dir=expr + unique int id: @differentiable_function_extract_original_expr +); + +dot_self_exprs( //dir=expr + unique int id: @dot_self_expr +); + +@dynamic_lookup_expr = + @dynamic_member_ref_expr +| @dynamic_subscript_expr +; + +erasure_exprs( //dir=expr + unique int id: @erasure_expr +); + +existential_metatype_to_object_exprs( //dir=expr + unique int id: @existential_metatype_to_object_expr +); + +force_try_exprs( //dir=expr + unique int id: @force_try_expr +); + +foreign_object_conversion_exprs( //dir=expr + unique int id: @foreign_object_conversion_expr +); + +function_conversion_exprs( //dir=expr + unique int id: @function_conversion_expr +); + +in_out_to_pointer_exprs( //dir=expr + 
unique int id: @in_out_to_pointer_expr +); + +inject_into_optional_exprs( //dir=expr + unique int id: @inject_into_optional_expr +); + +interpolated_string_literal_exprs( //dir=expr + unique int id: @interpolated_string_literal_expr +); + +#keyset[id] +interpolated_string_literal_expr_interpolation_exprs( //dir=expr + int id: @interpolated_string_literal_expr ref, + int interpolation_expr: @opaque_value_expr_or_none ref +); + +#keyset[id] +interpolated_string_literal_expr_interpolation_count_exprs( //dir=expr + int id: @interpolated_string_literal_expr ref, + int interpolation_count_expr: @expr_or_none ref +); + +#keyset[id] +interpolated_string_literal_expr_literal_capacity_exprs( //dir=expr + int id: @interpolated_string_literal_expr ref, + int literal_capacity_expr: @expr_or_none ref +); + +#keyset[id] +interpolated_string_literal_expr_appending_exprs( //dir=expr + int id: @interpolated_string_literal_expr ref, + int appending_expr: @tap_expr_or_none ref +); + +linear_function_exprs( //dir=expr + unique int id: @linear_function_expr +); + +linear_function_extract_original_exprs( //dir=expr + unique int id: @linear_function_extract_original_expr +); + +linear_to_differentiable_function_exprs( //dir=expr + unique int id: @linear_to_differentiable_function_expr +); + +load_exprs( //dir=expr + unique int id: @load_expr +); + +member_ref_exprs( //dir=expr + unique int id: @member_ref_expr +); + +#keyset[id] +member_ref_expr_has_direct_to_storage_semantics( //dir=expr + int id: @member_ref_expr ref +); + +#keyset[id] +member_ref_expr_has_direct_to_implementation_semantics( //dir=expr + int id: @member_ref_expr ref +); + +#keyset[id] +member_ref_expr_has_ordinary_semantics( //dir=expr + int id: @member_ref_expr ref +); + +metatype_conversion_exprs( //dir=expr + unique int id: @metatype_conversion_expr +); + +method_ref_exprs( //dir=expr + unique int id: @method_ref_expr +); + +nil_literal_exprs( //dir=expr + unique int id: @nil_literal_expr +); + +object_literal_exprs( 
//dir=expr + unique int id: @object_literal_expr, + int kind: int ref +); + +#keyset[id, index] +object_literal_expr_arguments( //dir=expr + int id: @object_literal_expr ref, + int index: int ref, + int argument: @argument_or_none ref +); + +optional_try_exprs( //dir=expr + unique int id: @optional_try_expr +); + +paren_exprs( //dir=expr + unique int id: @paren_expr +); + +pointer_to_pointer_exprs( //dir=expr + unique int id: @pointer_to_pointer_expr +); + +postfix_unary_exprs( //dir=expr + unique int id: @postfix_unary_expr +); + +prefix_unary_exprs( //dir=expr + unique int id: @prefix_unary_expr +); + +protocol_metatype_to_object_exprs( //dir=expr + unique int id: @protocol_metatype_to_object_expr +); + +regex_literal_exprs( //dir=expr + unique int id: @regex_literal_expr +); + +@self_apply_expr = + @constructor_ref_call_expr +| @dot_syntax_call_expr +; + +#keyset[id] +self_apply_exprs( //dir=expr + int id: @self_apply_expr ref, + int base: @expr_or_none ref +); + +string_to_pointer_exprs( //dir=expr + unique int id: @string_to_pointer_expr +); + +subscript_exprs( //dir=expr + unique int id: @subscript_expr +); + +#keyset[id, index] +subscript_expr_arguments( //dir=expr + int id: @subscript_expr ref, + int index: int ref, + int argument: @argument_or_none ref +); + +#keyset[id] +subscript_expr_has_direct_to_storage_semantics( //dir=expr + int id: @subscript_expr ref +); + +#keyset[id] +subscript_expr_has_direct_to_implementation_semantics( //dir=expr + int id: @subscript_expr ref +); + +#keyset[id] +subscript_expr_has_ordinary_semantics( //dir=expr + int id: @subscript_expr ref +); + +try_exprs( //dir=expr + unique int id: @try_expr +); + +underlying_to_opaque_exprs( //dir=expr + unique int id: @underlying_to_opaque_expr +); + +unevaluated_instance_exprs( //dir=expr + unique int id: @unevaluated_instance_expr +); + +unresolved_member_chain_result_exprs( //dir=expr + unique int id: @unresolved_member_chain_result_expr +); + +unresolved_type_conversion_exprs( 
//dir=expr + unique int id: @unresolved_type_conversion_expr +); + +boolean_literal_exprs( //dir=expr + unique int id: @boolean_literal_expr, + boolean value: boolean ref +); + +conditional_checked_cast_exprs( //dir=expr + unique int id: @conditional_checked_cast_expr +); + +constructor_ref_call_exprs( //dir=expr + unique int id: @constructor_ref_call_expr +); + +dot_syntax_call_exprs( //dir=expr + unique int id: @dot_syntax_call_expr +); + +dynamic_member_ref_exprs( //dir=expr + unique int id: @dynamic_member_ref_expr +); + +dynamic_subscript_exprs( //dir=expr + unique int id: @dynamic_subscript_expr +); + +forced_checked_cast_exprs( //dir=expr + unique int id: @forced_checked_cast_expr +); + +is_exprs( //dir=expr + unique int id: @is_expr +); + +magic_identifier_literal_exprs( //dir=expr + unique int id: @magic_identifier_literal_expr, + string kind: string ref +); + +@number_literal_expr = + @float_literal_expr +| @integer_literal_expr +; + +string_literal_exprs( //dir=expr + unique int id: @string_literal_expr, + string value: string ref +); + +float_literal_exprs( //dir=expr + unique int id: @float_literal_expr, + string string_value: string ref +); + +integer_literal_exprs( //dir=expr + unique int id: @integer_literal_expr, + string string_value: string ref +); + +@pattern = + @any_pattern +| @binding_pattern +| @bool_pattern +| @enum_element_pattern +| @expr_pattern +| @is_pattern +| @named_pattern +| @optional_some_pattern +| @paren_pattern +| @tuple_pattern +| @typed_pattern +; + +any_patterns( //dir=pattern + unique int id: @any_pattern +); + +binding_patterns( //dir=pattern + unique int id: @binding_pattern, + int sub_pattern: @pattern_or_none ref +); + +bool_patterns( //dir=pattern + unique int id: @bool_pattern, + boolean value: boolean ref +); + +enum_element_patterns( //dir=pattern + unique int id: @enum_element_pattern, + int element: @enum_element_decl_or_none ref +); + +#keyset[id] +enum_element_pattern_sub_patterns( //dir=pattern + int id: 
@enum_element_pattern ref, + int sub_pattern: @pattern_or_none ref +); + +expr_patterns( //dir=pattern + unique int id: @expr_pattern, + int sub_expr: @expr_or_none ref +); + +is_patterns( //dir=pattern + unique int id: @is_pattern +); + +#keyset[id] +is_pattern_cast_type_reprs( //dir=pattern + int id: @is_pattern ref, + int cast_type_repr: @type_repr_or_none ref +); + +#keyset[id] +is_pattern_sub_patterns( //dir=pattern + int id: @is_pattern ref, + int sub_pattern: @pattern_or_none ref +); + +named_patterns( //dir=pattern + unique int id: @named_pattern, + string name: string ref +); + +optional_some_patterns( //dir=pattern + unique int id: @optional_some_pattern, + int sub_pattern: @pattern_or_none ref +); + +paren_patterns( //dir=pattern + unique int id: @paren_pattern, + int sub_pattern: @pattern_or_none ref +); + +tuple_patterns( //dir=pattern + unique int id: @tuple_pattern +); + +#keyset[id, index] +tuple_pattern_elements( //dir=pattern + int id: @tuple_pattern ref, + int index: int ref, + int element: @pattern_or_none ref +); + +typed_patterns( //dir=pattern + unique int id: @typed_pattern, + int sub_pattern: @pattern_or_none ref +); + +#keyset[id] +typed_pattern_type_reprs( //dir=pattern + int id: @typed_pattern ref, + int type_repr: @type_repr_or_none ref +); + +case_label_items( //dir=stmt + unique int id: @case_label_item, + int pattern: @pattern_or_none ref +); + +#keyset[id] +case_label_item_guards( //dir=stmt + int id: @case_label_item ref, + int guard: @expr_or_none ref +); + +condition_elements( //dir=stmt + unique int id: @condition_element +); + +#keyset[id] +condition_element_booleans( //dir=stmt + int id: @condition_element ref, + int boolean_: @expr_or_none ref +); + +#keyset[id] +condition_element_patterns( //dir=stmt + int id: @condition_element ref, + int pattern: @pattern_or_none ref +); + +#keyset[id] +condition_element_initializers( //dir=stmt + int id: @condition_element ref, + int initializer: @expr_or_none ref +); + +@stmt = + 
@brace_stmt +| @break_stmt +| @case_stmt +| @continue_stmt +| @defer_stmt +| @fail_stmt +| @fallthrough_stmt +| @labeled_stmt +| @pound_assert_stmt +| @return_stmt +| @throw_stmt +| @yield_stmt +; + +stmt_conditions( //dir=stmt + unique int id: @stmt_condition +); + +#keyset[id, index] +stmt_condition_elements( //dir=stmt + int id: @stmt_condition ref, + int index: int ref, + int element: @condition_element_or_none ref +); + +brace_stmts( //dir=stmt + unique int id: @brace_stmt +); + +#keyset[id, index] +brace_stmt_elements( //dir=stmt + int id: @brace_stmt ref, + int index: int ref, + int element: @ast_node_or_none ref +); + +break_stmts( //dir=stmt + unique int id: @break_stmt +); + +#keyset[id] +break_stmt_target_names( //dir=stmt + int id: @break_stmt ref, + string target_name: string ref +); + +#keyset[id] +break_stmt_targets( //dir=stmt + int id: @break_stmt ref, + int target: @stmt_or_none ref +); + +case_stmts( //dir=stmt + unique int id: @case_stmt, + int body: @stmt_or_none ref +); + +#keyset[id, index] +case_stmt_labels( //dir=stmt + int id: @case_stmt ref, + int index: int ref, + int label: @case_label_item_or_none ref +); + +#keyset[id, index] +case_stmt_variables( //dir=stmt + int id: @case_stmt ref, + int index: int ref, + int variable: @var_decl_or_none ref +); + +continue_stmts( //dir=stmt + unique int id: @continue_stmt +); + +#keyset[id] +continue_stmt_target_names( //dir=stmt + int id: @continue_stmt ref, + string target_name: string ref +); + +#keyset[id] +continue_stmt_targets( //dir=stmt + int id: @continue_stmt ref, + int target: @stmt_or_none ref +); + +defer_stmts( //dir=stmt + unique int id: @defer_stmt, + int body: @brace_stmt_or_none ref +); + +fail_stmts( //dir=stmt + unique int id: @fail_stmt +); + +fallthrough_stmts( //dir=stmt + unique int id: @fallthrough_stmt, + int fallthrough_source: @case_stmt_or_none ref, + int fallthrough_dest: @case_stmt_or_none ref +); + +@labeled_stmt = + @do_catch_stmt +| @do_stmt +| @for_each_stmt +| 
@labeled_conditional_stmt +| @repeat_while_stmt +| @switch_stmt +; + +#keyset[id] +labeled_stmt_labels( //dir=stmt + int id: @labeled_stmt ref, + string label: string ref +); + +pound_assert_stmts( //dir=stmt + unique int id: @pound_assert_stmt, + int condition: @expr_or_none ref, + string message: string ref +); + +return_stmts( //dir=stmt + unique int id: @return_stmt +); + +#keyset[id] +return_stmt_results( //dir=stmt + int id: @return_stmt ref, + int result: @expr_or_none ref +); + +throw_stmts( //dir=stmt + unique int id: @throw_stmt, + int sub_expr: @expr_or_none ref +); + +yield_stmts( //dir=stmt + unique int id: @yield_stmt +); + +#keyset[id, index] +yield_stmt_results( //dir=stmt + int id: @yield_stmt ref, + int index: int ref, + int result: @expr_or_none ref +); + +do_catch_stmts( //dir=stmt + unique int id: @do_catch_stmt, + int body: @stmt_or_none ref +); + +#keyset[id, index] +do_catch_stmt_catches( //dir=stmt + int id: @do_catch_stmt ref, + int index: int ref, + int catch: @case_stmt_or_none ref +); + +do_stmts( //dir=stmt + unique int id: @do_stmt, + int body: @brace_stmt_or_none ref +); + +for_each_stmts( //dir=stmt + unique int id: @for_each_stmt, + int pattern: @pattern_or_none ref, + int sequence: @expr_or_none ref, + int body: @brace_stmt_or_none ref +); + +#keyset[id] +for_each_stmt_wheres( //dir=stmt + int id: @for_each_stmt ref, + int where: @expr_or_none ref +); + +@labeled_conditional_stmt = + @guard_stmt +| @if_stmt +| @while_stmt +; + +#keyset[id] +labeled_conditional_stmts( //dir=stmt + int id: @labeled_conditional_stmt ref, + int condition: @stmt_condition_or_none ref +); + +repeat_while_stmts( //dir=stmt + unique int id: @repeat_while_stmt, + int condition: @expr_or_none ref, + int body: @stmt_or_none ref +); + +switch_stmts( //dir=stmt + unique int id: @switch_stmt, + int expr: @expr_or_none ref +); + +#keyset[id, index] +switch_stmt_cases( //dir=stmt + int id: @switch_stmt ref, + int index: int ref, + int case_: @case_stmt_or_none 
ref +); + +guard_stmts( //dir=stmt + unique int id: @guard_stmt, + int body: @brace_stmt_or_none ref +); + +if_stmts( //dir=stmt + unique int id: @if_stmt, + int then: @stmt_or_none ref +); + +#keyset[id] +if_stmt_elses( //dir=stmt + int id: @if_stmt ref, + int else: @stmt_or_none ref +); + +while_stmts( //dir=stmt + unique int id: @while_stmt, + int body: @stmt_or_none ref +); + +@type = + @any_function_type +| @any_generic_type +| @any_metatype_type +| @builtin_type +| @dependent_member_type +| @dynamic_self_type +| @error_type +| @existential_type +| @in_out_type +| @l_value_type +| @module_type +| @parameterized_protocol_type +| @protocol_composition_type +| @reference_storage_type +| @substitutable_type +| @sugar_type +| @tuple_type +| @unresolved_type +; + +#keyset[id] +types( //dir=type + int id: @type ref, + string name: string ref, + int canonical_type: @type_or_none ref +); + +type_reprs( //dir=type + unique int id: @type_repr, + int type_: @type_or_none ref +); + +@any_function_type = + @function_type +| @generic_function_type +; + +#keyset[id] +any_function_types( //dir=type + int id: @any_function_type ref, + int result: @type_or_none ref +); + +#keyset[id, index] +any_function_type_param_types( //dir=type + int id: @any_function_type ref, + int index: int ref, + int param_type: @type_or_none ref +); + +#keyset[id, index] +any_function_type_param_labels( //dir=type + int id: @any_function_type ref, + int index: int ref, + string param_label: string ref +); + +#keyset[id] +any_function_type_is_throwing( //dir=type + int id: @any_function_type ref +); + +#keyset[id] +any_function_type_is_async( //dir=type + int id: @any_function_type ref +); + +@any_generic_type = + @nominal_or_bound_generic_nominal_type +| @unbound_generic_type +; + +#keyset[id] +any_generic_types( //dir=type + int id: @any_generic_type ref, + int declaration: @decl_or_none ref +); + +#keyset[id] +any_generic_type_parents( //dir=type + int id: @any_generic_type ref, + int parent: 
@type_or_none ref +); + +@any_metatype_type = + @existential_metatype_type +| @metatype_type +; + +@builtin_type = + @any_builtin_integer_type +| @builtin_bridge_object_type +| @builtin_default_actor_storage_type +| @builtin_executor_type +| @builtin_float_type +| @builtin_job_type +| @builtin_native_object_type +| @builtin_raw_pointer_type +| @builtin_raw_unsafe_continuation_type +| @builtin_unsafe_value_buffer_type +| @builtin_vector_type +; + +dependent_member_types( //dir=type + unique int id: @dependent_member_type, + int base_type: @type_or_none ref, + int associated_type_decl: @associated_type_decl_or_none ref +); + +dynamic_self_types( //dir=type + unique int id: @dynamic_self_type, + int static_self_type: @type_or_none ref +); + +error_types( //dir=type + unique int id: @error_type +); + +existential_types( //dir=type + unique int id: @existential_type, + int constraint: @type_or_none ref +); + +in_out_types( //dir=type + unique int id: @in_out_type, + int object_type: @type_or_none ref +); + +l_value_types( //dir=type + unique int id: @l_value_type, + int object_type: @type_or_none ref +); + +module_types( //dir=type + unique int id: @module_type, + int module: @module_decl_or_none ref +); + +parameterized_protocol_types( //dir=type + unique int id: @parameterized_protocol_type, + int base: @protocol_type_or_none ref +); + +#keyset[id, index] +parameterized_protocol_type_args( //dir=type + int id: @parameterized_protocol_type ref, + int index: int ref, + int arg: @type_or_none ref +); + +protocol_composition_types( //dir=type + unique int id: @protocol_composition_type +); + +#keyset[id, index] +protocol_composition_type_members( //dir=type + int id: @protocol_composition_type ref, + int index: int ref, + int member: @type_or_none ref +); + +@reference_storage_type = + @unmanaged_storage_type +| @unowned_storage_type +| @weak_storage_type +; + +#keyset[id] +reference_storage_types( //dir=type + int id: @reference_storage_type ref, + int referent_type: 
@type_or_none ref +); + +@substitutable_type = + @archetype_type +| @generic_type_param_type +; + +@sugar_type = + @paren_type +| @syntax_sugar_type +| @type_alias_type +; + +tuple_types( //dir=type + unique int id: @tuple_type +); + +#keyset[id, index] +tuple_type_types( //dir=type + int id: @tuple_type ref, + int index: int ref, + int type_: @type_or_none ref +); + +#keyset[id, index] +tuple_type_names( //dir=type + int id: @tuple_type ref, + int index: int ref, + string name: string ref +); + +unresolved_types( //dir=type + unique int id: @unresolved_type +); + +@any_builtin_integer_type = + @builtin_integer_literal_type +| @builtin_integer_type +; + +@archetype_type = + @opaque_type_archetype_type +| @opened_archetype_type +| @primary_archetype_type +; + +#keyset[id] +archetype_types( //dir=type + int id: @archetype_type ref, + int interface_type: @type_or_none ref +); + +#keyset[id] +archetype_type_superclasses( //dir=type + int id: @archetype_type ref, + int superclass: @type_or_none ref +); + +#keyset[id, index] +archetype_type_protocols( //dir=type + int id: @archetype_type ref, + int index: int ref, + int protocol: @protocol_decl_or_none ref +); + +builtin_bridge_object_types( //dir=type + unique int id: @builtin_bridge_object_type +); + +builtin_default_actor_storage_types( //dir=type + unique int id: @builtin_default_actor_storage_type +); + +builtin_executor_types( //dir=type + unique int id: @builtin_executor_type +); + +builtin_float_types( //dir=type + unique int id: @builtin_float_type +); + +builtin_job_types( //dir=type + unique int id: @builtin_job_type +); + +builtin_native_object_types( //dir=type + unique int id: @builtin_native_object_type +); + +builtin_raw_pointer_types( //dir=type + unique int id: @builtin_raw_pointer_type +); + +builtin_raw_unsafe_continuation_types( //dir=type + unique int id: @builtin_raw_unsafe_continuation_type +); + +builtin_unsafe_value_buffer_types( //dir=type + unique int id: @builtin_unsafe_value_buffer_type +); 
+ +builtin_vector_types( //dir=type + unique int id: @builtin_vector_type +); + +existential_metatype_types( //dir=type + unique int id: @existential_metatype_type +); + +function_types( //dir=type + unique int id: @function_type +); + +generic_function_types( //dir=type + unique int id: @generic_function_type +); + +#keyset[id, index] +generic_function_type_generic_params( //dir=type + int id: @generic_function_type ref, + int index: int ref, + int generic_param: @generic_type_param_type_or_none ref +); + +generic_type_param_types( //dir=type + unique int id: @generic_type_param_type +); + +metatype_types( //dir=type + unique int id: @metatype_type +); + +@nominal_or_bound_generic_nominal_type = + @bound_generic_type +| @nominal_type +; + +paren_types( //dir=type + unique int id: @paren_type, + int type_: @type_or_none ref +); + +@syntax_sugar_type = + @dictionary_type +| @unary_syntax_sugar_type +; + +type_alias_types( //dir=type + unique int id: @type_alias_type, + int decl: @type_alias_decl_or_none ref +); + +unbound_generic_types( //dir=type + unique int id: @unbound_generic_type +); + +unmanaged_storage_types( //dir=type + unique int id: @unmanaged_storage_type +); + +unowned_storage_types( //dir=type + unique int id: @unowned_storage_type +); + +weak_storage_types( //dir=type + unique int id: @weak_storage_type +); + +@bound_generic_type = + @bound_generic_class_type +| @bound_generic_enum_type +| @bound_generic_struct_type +; + +#keyset[id, index] +bound_generic_type_arg_types( //dir=type + int id: @bound_generic_type ref, + int index: int ref, + int arg_type: @type_or_none ref +); + +builtin_integer_literal_types( //dir=type + unique int id: @builtin_integer_literal_type +); + +builtin_integer_types( //dir=type + unique int id: @builtin_integer_type +); + +#keyset[id] +builtin_integer_type_widths( //dir=type + int id: @builtin_integer_type ref, + int width: int ref +); + +dictionary_types( //dir=type + unique int id: @dictionary_type, + int key_type: 
@type_or_none ref, + int value_type: @type_or_none ref +); + +@nominal_type = + @class_type +| @enum_type +| @protocol_type +| @struct_type +; + +opaque_type_archetype_types( //dir=type + unique int id: @opaque_type_archetype_type, + int declaration: @opaque_type_decl_or_none ref +); + +opened_archetype_types( //dir=type + unique int id: @opened_archetype_type +); + +primary_archetype_types( //dir=type + unique int id: @primary_archetype_type +); + +@unary_syntax_sugar_type = + @array_slice_type +| @optional_type +| @variadic_sequence_type +; + +#keyset[id] +unary_syntax_sugar_types( //dir=type + int id: @unary_syntax_sugar_type ref, + int base_type: @type_or_none ref +); + +array_slice_types( //dir=type + unique int id: @array_slice_type +); + +bound_generic_class_types( //dir=type + unique int id: @bound_generic_class_type +); + +bound_generic_enum_types( //dir=type + unique int id: @bound_generic_enum_type +); + +bound_generic_struct_types( //dir=type + unique int id: @bound_generic_struct_type +); + +class_types( //dir=type + unique int id: @class_type +); + +enum_types( //dir=type + unique int id: @enum_type +); + +optional_types( //dir=type + unique int id: @optional_type +); + +protocol_types( //dir=type + unique int id: @protocol_type +); + +struct_types( //dir=type + unique int id: @struct_type +); + +variadic_sequence_types( //dir=type + unique int id: @variadic_sequence_type +); + +@abstract_function_decl_or_none = + @abstract_function_decl +| @unspecified_element +; + +@accessor_decl_or_none = + @accessor_decl +| @unspecified_element +; + +@argument_or_none = + @argument +| @unspecified_element +; + +@associated_type_decl_or_none = + @associated_type_decl +| @unspecified_element +; + +@ast_node_or_none = + @ast_node +| @unspecified_element +; + +@brace_stmt_or_none = + @brace_stmt +| @unspecified_element +; + +@case_label_item_or_none = + @case_label_item +| @unspecified_element +; + +@case_stmt_or_none = + @case_stmt +| @unspecified_element +; + 
+@closure_expr_or_none = + @closure_expr +| @unspecified_element +; + +@condition_element_or_none = + @condition_element +| @unspecified_element +; + +@constructor_decl_or_none = + @constructor_decl +| @unspecified_element +; + +@decl_or_none = + @decl +| @unspecified_element +; + +@enum_element_decl_or_none = + @enum_element_decl +| @unspecified_element +; + +@expr_or_none = + @expr +| @unspecified_element +; + +@file_or_none = + @file +| @unspecified_element +; + +@generic_type_param_decl_or_none = + @generic_type_param_decl +| @unspecified_element +; + +@generic_type_param_type_or_none = + @generic_type_param_type +| @unspecified_element +; + +@location_or_none = + @location +| @unspecified_element +; + +@module_decl_or_none = + @module_decl +| @unspecified_element +; + +@nominal_type_decl_or_none = + @nominal_type_decl +| @unspecified_element +; + +@opaque_type_decl_or_none = + @opaque_type_decl +| @unspecified_element +; + +@opaque_value_expr_or_none = + @opaque_value_expr +| @unspecified_element +; + +@param_decl_or_none = + @param_decl +| @unspecified_element +; + +@pattern_or_none = + @pattern +| @unspecified_element +; + +@pattern_binding_decl_or_none = + @pattern_binding_decl +| @unspecified_element +; + +@precedence_group_decl_or_none = + @precedence_group_decl +| @unspecified_element +; + +@protocol_decl_or_none = + @protocol_decl +| @unspecified_element +; + +@protocol_type_or_none = + @protocol_type +| @unspecified_element +; + +@stmt_or_none = + @stmt +| @unspecified_element +; + +@stmt_condition_or_none = + @stmt_condition +| @unspecified_element +; + +@string_literal_expr_or_none = + @string_literal_expr +| @unspecified_element +; + +@tap_expr_or_none = + @tap_expr +| @unspecified_element +; + +@type_or_none = + @type +| @unspecified_element +; + +@type_alias_decl_or_none = + @type_alias_decl +| @unspecified_element +; + +@type_repr_or_none = + @type_repr +| @unspecified_element +; + +@value_decl_or_none = + @unspecified_element +| @value_decl +; + 
+@var_decl_or_none = + @unspecified_element +| @var_decl +; diff --git a/swift/downgrades/qlpack.yml b/swift/downgrades/qlpack.yml new file mode 100644 index 00000000000..3fc919124df --- /dev/null +++ b/swift/downgrades/qlpack.yml @@ -0,0 +1,4 @@ +name: codeql/swift-downgrades +groups: swift +downgrades: . +library: true diff --git a/swift/extractor/infra/SwiftTagTraits.h b/swift/extractor/infra/SwiftTagTraits.h index 1f976557b54..78b7bcc530e 100644 --- a/swift/extractor/infra/SwiftTagTraits.h +++ b/swift/extractor/infra/SwiftTagTraits.h @@ -9,25 +9,42 @@ namespace codeql { -// codegen goes with QL acronym convention (Sil instead of SIL), we need to remap it to Swift's -// convention -using SILBlockStorageTypeTag = SilBlockStorageTypeTag; -using SILBoxTypeTag = SilBoxTypeTag; -using SILFunctionTypeTag = SilFunctionTypeTag; -using SILTokenTypeTag = SilTokenTypeTag; - // OverloadSetRefExpr is collapsed with its only derived class OverloadedDeclRefExpr using OverloadSetRefExprTag = OverloadedDeclRefExprTag; +// We don't really expect to see the following in extraction. Mapping these tags to void effectively +// ignores all elements of that class (with a message). 
+ +// only generated for code editing +using CodeCompletionExprTag = void; +using EditorPlaceholderExprTag = void; +// not present after the Sema phase +using ArrowExprTag = void; +// experimental variadic generics, implemented only in the frontend for now, thus not compilable +using PackExprTag = void; +using PackTypeTag = void; +using ReifyPackExprTag = void; +using PackExpansionTypeTag = void; +using SequenceArchetypeTypeTag = void; +// Placeholder types appear in ambiguous types but are anyway transformed to UnresolvedType +using PlaceholderTypeTag = void; +// SIL types that cannot really appear in the frontend run +using SILBlockStorageTypeTag = void; +using SILBoxTypeTag = void; +using SILFunctionTypeTag = void; +using SILTokenTypeTag = void; +// This is created during type checking and is only used for constraint checking +using TypeVariableTypeTag = void; + #define MAP_TYPE_TO_TAG(TYPE, TAG) \ template <> \ struct detail::ToTagFunctor { \ using type = TAG; \ } #define MAP_TAG(TYPE) MAP_TYPE_TO_TAG(swift::TYPE, TYPE##Tag) -#define MAP_SUBTAG(TYPE, PARENT) \ - MAP_TAG(TYPE); \ - static_assert(std::is_base_of_v , \ +#define MAP_SUBTAG(TYPE, PARENT) \ + MAP_TAG(TYPE); \ + static_assert(std::is_same_v || std::is_base_of_v , \ #PARENT "Tag must be a base of " #TYPE "Tag"); #define OVERRIDE_TAG(TYPE, TAG) \ diff --git a/swift/extractor/print_unextracted/main.cpp b/swift/extractor/print_unextracted/main.cpp index 9496abd752e..09832ce992e 100644 --- a/swift/extractor/print_unextracted/main.cpp +++ b/swift/extractor/print_unextracted/main.cpp @@ -1,4 +1,6 @@ #include +#include +#include