From 5fbcbbc584c208642d08f4346e372c849d80bc39 Mon Sep 17 00:00:00 2001 From: erik-krogh Date: Tue, 1 Nov 2022 12:00:57 +0100 Subject: [PATCH] move existing regex-tree into a module --- python/ql/lib/semmle/python/RegexTreeView.qll | 1874 +++++++++-------- 1 file changed, 939 insertions(+), 935 deletions(-) diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index 808a784d472..aa884c56b61 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -2,6 +2,10 @@ import python private import semmle.python.regex +import Impl + +/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */ +RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() } /** * An element containing a regular expression term, that is, either @@ -12,7 +16,7 @@ private import semmle.python.regex * Otherwise, we wish to represent the term differently. * This avoids multiple representations of the same term. */ -newtype TRegExpParent = +private newtype TRegExpParent = /** A string literal used as a regular expression */ TRegExpLiteral(Regex re) or /** A quantified term */ @@ -48,982 +52,982 @@ newtype TRegExpParent = /** A back reference */ TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) } -/** - * An element containing a regular expression term, that is, either - * a string literal (parsed as a regular expression) - * or another regular expression term. - */ -class RegExpParent extends TRegExpParent { - /** Gets a textual representation of this element. */ - string toString() { result = "RegExpParent" } +/** An implementation that statisfies the RegexTreeView signature. */ +module Impl { + /** + * An element containing a regular expression term, that is, either + * a string literal (parsed as a regular expression) + * or another regular expression term. + */ + class RegExpParent extends TRegExpParent { + /** Gets a textual representation of this element. */ + string toString() { result = "RegExpParent" } - /** Gets the `i`th child term. */ - abstract RegExpTerm getChild(int i); + /** Gets the `i`th child term. */ + abstract RegExpTerm getChild(int i); - /** Gets a child term . */ - RegExpTerm getAChild() { result = this.getChild(_) } + /** Gets a child term . */ + RegExpTerm getAChild() { result = this.getChild(_) } - /** Gets the number of child terms. */ - int getNumChild() { result = count(this.getAChild()) } + /** Gets the number of child terms. */ + int getNumChild() { result = count(this.getAChild()) } - /** Gets the associated regex. */ - abstract Regex getRegex(); -} + /** Gets the associated regex. */ + abstract Regex getRegex(); + } -/** A string literal used as a regular expression */ -class RegExpLiteral extends TRegExpLiteral, RegExpParent { - Regex re; + /** A string literal used as a regular expression */ + class RegExpLiteral extends TRegExpLiteral, RegExpParent { + Regex re; - RegExpLiteral() { this = TRegExpLiteral(re) } + RegExpLiteral() { this = TRegExpLiteral(re) } - override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() } + override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() } - /** Holds if dot, `.`, matches all characters, including newlines. */ - predicate isDotAll() { re.getAMode() = "DOTALL" } + /** Holds if dot, `.`, matches all characters, including newlines. */ + predicate isDotAll() { re.getAMode() = "DOTALL" } - /** Holds if this regex matching is case-insensitive for this regex. */ - predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" } + /** Holds if this regex matching is case-insensitive for this regex. */ + predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" } - /** Get a string representing all modes for this regex. */ - string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") } + /** Get a string representing all modes for this regex. */ + string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") } - override Regex getRegex() { result = re } + override Regex getRegex() { result = re } - /** Gets the primary QL class for this regex. */ - string getPrimaryQLClass() { result = "RegExpLiteral" } -} - -/** - * A regular expression term, that is, a syntactic part of a regular expression. - */ -class RegExpTerm extends RegExpParent { - Regex re; - int start; - int end; - - RegExpTerm() { - this = TRegExpAlt(re, start, end) - or - this = TRegExpBackRef(re, start, end) - or - this = TRegExpCharacterClass(re, start, end) - or - this = TRegExpCharacterRange(re, start, end) - or - this = TRegExpNormalChar(re, start, end) - or - this = TRegExpGroup(re, start, end) - or - this = TRegExpQuantifier(re, start, end) - or - this = TRegExpSequence(re, start, end) - or - this = TRegExpSpecialChar(re, start, end) + /** Gets the primary QL class for this regex. */ + string getPrimaryQLClass() { result = "RegExpLiteral" } } /** - * Gets the outermost term of this regular expression. + * A regular expression term, that is, a syntactic part of a regular expression. */ - RegExpTerm getRootTerm() { - this.isRootTerm() and result = this - or - result = this.getParent().(RegExpTerm).getRootTerm() - } + class RegExpTerm extends RegExpParent { + Regex re; + int start; + int end; - /** - * Holds if this term is part of a string literal - * that is interpreted as a regular expression. - */ - predicate isUsedAsRegExp() { any() } - - /** - * Holds if this is the root term of a regular expression. - */ - predicate isRootTerm() { start = 0 and end = re.getText().length() } - - override RegExpTerm getChild(int i) { - result = this.(RegExpAlt).getChild(i) - or - result = this.(RegExpBackRef).getChild(i) - or - result = this.(RegExpCharacterClass).getChild(i) - or - result = this.(RegExpCharacterRange).getChild(i) - or - result = this.(RegExpNormalChar).getChild(i) - or - result = this.(RegExpGroup).getChild(i) - or - result = this.(RegExpQuantifier).getChild(i) - or - result = this.(RegExpSequence).getChild(i) - or - result = this.(RegExpSpecialChar).getChild(i) - } - - /** - * Gets the parent term of this regular expression term, or the - * regular expression literal if this is the root term. - */ - RegExpParent getParent() { result.getAChild() = this } - - override Regex getRegex() { result = re } - - /** Gets the offset at which this term starts. */ - int getStart() { result = start } - - /** Gets the offset at which this term ends. */ - int getEnd() { result = end } - - override string toString() { result = re.getText().substring(start, end) } - - /** - * Gets the location of the surrounding regex, as locations inside the regex do not exist. - * To get location information corresponding to the term inside the regex, - * use `hasLocationInfo`. - */ - Location getLocation() { result = re.getLocation() } - - /** Holds if this term is found at the specified location offsets. */ - predicate hasLocationInfo( - string filepath, int startline, int startcolumn, int endline, int endcolumn - ) { - exists(int re_start, int re_end | - re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, re_end) and - startcolumn = re_start + start + 4 and - endcolumn = re_start + end + 3 - ) - } - - /** Gets the file in which this term is found. */ - File getFile() { result = this.getLocation().getFile() } - - /** Gets the raw source text of this term. */ - string getRawValue() { result = this.toString() } - - /** Gets the string literal in which this term is found. */ - RegExpLiteral getLiteral() { result = TRegExpLiteral(re) } - - /** Gets the regular expression term that is matched (textually) before this one, if any. */ - RegExpTerm getPredecessor() { - exists(RegExpTerm parent | parent = this.getParent() | - result = parent.(RegExpSequence).previousElement(this) + RegExpTerm() { + this = TRegExpAlt(re, start, end) or - not exists(parent.(RegExpSequence).previousElement(this)) and - not parent instanceof RegExpSubPattern and - result = parent.getPredecessor() - ) - } - - /** Gets the regular expression term that is matched (textually) after this one, if any. */ - RegExpTerm getSuccessor() { - exists(RegExpTerm parent | parent = this.getParent() | - result = parent.(RegExpSequence).nextElement(this) + this = TRegExpBackRef(re, start, end) or - not exists(parent.(RegExpSequence).nextElement(this)) and - not parent instanceof RegExpSubPattern and - result = parent.getSuccessor() - ) + this = TRegExpCharacterClass(re, start, end) + or + this = TRegExpCharacterRange(re, start, end) + or + this = TRegExpNormalChar(re, start, end) + or + this = TRegExpGroup(re, start, end) + or + this = TRegExpQuantifier(re, start, end) + or + this = TRegExpSequence(re, start, end) + or + this = TRegExpSpecialChar(re, start, end) + } + + /** + * Gets the outermost term of this regular expression. + */ + RegExpTerm getRootTerm() { + this.isRootTerm() and result = this + or + result = this.getParent().(RegExpTerm).getRootTerm() + } + + /** + * Holds if this term is part of a string literal + * that is interpreted as a regular expression. + */ + predicate isUsedAsRegExp() { any() } + + /** + * Holds if this is the root term of a regular expression. + */ + predicate isRootTerm() { start = 0 and end = re.getText().length() } + + override RegExpTerm getChild(int i) { + result = this.(RegExpAlt).getChild(i) + or + result = this.(RegExpBackRef).getChild(i) + or + result = this.(RegExpCharacterClass).getChild(i) + or + result = this.(RegExpCharacterRange).getChild(i) + or + result = this.(RegExpNormalChar).getChild(i) + or + result = this.(RegExpGroup).getChild(i) + or + result = this.(RegExpQuantifier).getChild(i) + or + result = this.(RegExpSequence).getChild(i) + or + result = this.(RegExpSpecialChar).getChild(i) + } + + /** + * Gets the parent term of this regular expression term, or the + * regular expression literal if this is the root term. + */ + RegExpParent getParent() { result.getAChild() = this } + + override Regex getRegex() { result = re } + + /** Gets the offset at which this term starts. */ + int getStart() { result = start } + + /** Gets the offset at which this term ends. */ + int getEnd() { result = end } + + override string toString() { result = re.getText().substring(start, end) } + + /** + * Gets the location of the surrounding regex, as locations inside the regex do not exist. + * To get location information corresponding to the term inside the regex, + * use `hasLocationInfo`. + */ + Location getLocation() { result = re.getLocation() } + + /** Holds if this term is found at the specified location offsets. */ + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ) { + exists(int re_start, int re_end | + re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, re_end) and + startcolumn = re_start + start + 4 and + endcolumn = re_start + end + 3 + ) + } + + /** Gets the file in which this term is found. */ + File getFile() { result = this.getLocation().getFile() } + + /** Gets the raw source text of this term. */ + string getRawValue() { result = this.toString() } + + /** Gets the string literal in which this term is found. */ + RegExpLiteral getLiteral() { result = TRegExpLiteral(re) } + + /** Gets the regular expression term that is matched (textually) before this one, if any. */ + RegExpTerm getPredecessor() { + exists(RegExpTerm parent | parent = this.getParent() | + result = parent.(RegExpSequence).previousElement(this) + or + not exists(parent.(RegExpSequence).previousElement(this)) and + not parent instanceof RegExpSubPattern and + result = parent.getPredecessor() + ) + } + + /** Gets the regular expression term that is matched (textually) after this one, if any. */ + RegExpTerm getSuccessor() { + exists(RegExpTerm parent | parent = this.getParent() | + result = parent.(RegExpSequence).nextElement(this) + or + not exists(parent.(RegExpSequence).nextElement(this)) and + not parent instanceof RegExpSubPattern and + result = parent.getSuccessor() + ) + } + + /** Gets the primary QL class for this term. */ + string getPrimaryQLClass() { result = "RegExpTerm" } } - /** Gets the primary QL class for this term. */ - string getPrimaryQLClass() { result = "RegExpTerm" } -} - -/** - * A quantified regular expression term. - * - * Example: - * - * ``` - * ((ECMA|Java)[sS]cript)* - * ``` - */ -class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier { - int part_end; - boolean maybe_empty; - boolean may_repeat_forever; - - RegExpQuantifier() { - this = TRegExpQuantifier(re, start, end) and - re.qualifiedPart(start, part_end, end, maybe_empty, may_repeat_forever) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegex() = re and - result.getStart() = start and - result.getEnd() = part_end - } - - /** Hols if this term may match an unlimited number of times. */ - predicate mayRepeatForever() { may_repeat_forever = true } - - /** Gets the qualifier for this term. That is e.g "?" for "a?". */ - string getQualifier() { result = re.getText().substring(part_end, end) } - - override string getPrimaryQLClass() { result = "RegExpQuantifier" } -} - -/** - * A regular expression term that permits unlimited repetitions. - */ -class InfiniteRepetitionQuantifier extends RegExpQuantifier { - InfiniteRepetitionQuantifier() { this.mayRepeatForever() } -} - -/** - * A star-quantified term. - * - * Example: - * - * ``` - * \w* - * ``` - */ -class RegExpStar extends InfiniteRepetitionQuantifier { - RegExpStar() { this.getQualifier().charAt(0) = "*" } - - override string getPrimaryQLClass() { result = "RegExpStar" } -} - -/** - * A plus-quantified term. - * - * Example: - * - * ``` - * \w+ - * ``` - */ -class RegExpPlus extends InfiniteRepetitionQuantifier { - RegExpPlus() { this.getQualifier().charAt(0) = "+" } - - override string getPrimaryQLClass() { result = "RegExpPlus" } -} - -/** - * An optional term. - * - * Example: - * - * ``` - * ;? - * ``` - */ -class RegExpOpt extends RegExpQuantifier { - RegExpOpt() { this.getQualifier().charAt(0) = "?" } - - override string getPrimaryQLClass() { result = "RegExpOpt" } -} - -/** - * A range-quantified term - * - * Examples: - * - * ``` - * \w{2,4} - * \w{2,} - * \w{2} - * ``` - */ -class RegExpRange extends RegExpQuantifier { - string upper; - string lower; - - RegExpRange() { re.multiples(part_end, end, lower, upper) } - - /** Gets the string defining the upper bound of this range, if any. */ - string getUpper() { result = upper } - - /** Gets the string defining the lower bound of this range, if any. */ - string getLower() { result = lower } - /** - * Gets the upper bound of the range, if any. + * A quantified regular expression term. * - * If there is no upper bound, any number of repetitions is allowed. - * For a term of the form `r{lo}`, both the lower and the upper bound - * are `lo`. - */ - int getUpperBound() { result = this.getUpper().toInt() } - - /** Gets the lower bound of the range. */ - int getLowerBound() { result = this.getLower().toInt() } - - override string getPrimaryQLClass() { result = "RegExpRange" } -} - -/** - * A sequence term. - * - * Example: - * - * ``` - * (ECMA|Java)Script - * ``` - * - * This is a sequence with the elements `(ECMA|Java)` and `Script`. - */ -class RegExpSequence extends RegExpTerm, TRegExpSequence { - RegExpSequence() { this = TRegExpSequence(re, start, end) } - - override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) } - - /** Gets the element preceding `element` in this sequence. */ - RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) } - - /** Gets the element following `element` in this sequence. */ - RegExpTerm nextElement(RegExpTerm element) { - exists(int i | - element = this.getChild(i) and - result = this.getChild(i + 1) - ) - } - - override string getPrimaryQLClass() { result = "RegExpSequence" } -} - -pragma[nomagic] -private int seqChildEnd(Regex re, int start, int end, int i) { - result = seqChild(re, start, end, i).getEnd() -} - -// moved out so we can use it in the charpred -private RegExpTerm seqChild(Regex re, int start, int end, int i) { - re.sequence(start, end) and - ( - i = 0 and - result.getRegex() = re and - result.getStart() = start and - exists(int itemEnd | - re.item(start, itemEnd) and - result.getEnd() = itemEnd - ) - or - i > 0 and - result.getRegex() = re and - exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | - result.getStart() = itemStart and - re.item(itemStart, result.getEnd()) - ) - ) -} - -/** - * An alternative term, that is, a term of the form `a|b`. - * - * Example: - * - * ``` - * ECMA|Java - * ``` - */ -class RegExpAlt extends RegExpTerm, TRegExpAlt { - RegExpAlt() { this = TRegExpAlt(re, start, end) } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegex() = re and - result.getStart() = start and - exists(int part_end | - re.alternationOption(start, end, start, part_end) and - result.getEnd() = part_end - ) - or - i > 0 and - result.getRegex() = re and - exists(int part_start | - part_start = this.getChild(i - 1).getEnd() + 1 // allow for the | - | - result.getStart() = part_start and - re.alternationOption(start, end, part_start, result.getEnd()) - ) - } - - override string getPrimaryQLClass() { result = "RegExpAlt" } -} - -class RegExpCharEscape = RegExpEscape; - -/** - * An escaped regular expression term, that is, a regular expression - * term starting with a backslash, which is not a backreference. - * - * Example: - * - * ``` - * \. - * \w - * ``` - */ -class RegExpEscape extends RegExpNormalChar { - RegExpEscape() { re.escapedCharacter(start, end) } - - /** - * Gets the name of the escaped; for example, `w` for `\w`. - * TODO: Handle named escapes. - */ - override string getValue() { - not this.isUnicode() and - this.isIdentityEscape() and - result = this.getUnescaped() - or - this.getUnescaped() = "n" and result = "\n" - or - this.getUnescaped() = "r" and result = "\r" - or - this.getUnescaped() = "t" and result = "\t" - or - this.getUnescaped() = "f" and result = 12.toUnicode() - or - this.getUnescaped() = "v" and result = 11.toUnicode() - or - this.isUnicode() and - result = this.getUnicode() - } - - /** Holds if this terms name is given by the part following the escape character. */ - predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] } - - override string getPrimaryQLClass() { result = "RegExpEscape" } - - /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */ - string getUnescaped() { result = this.getText().suffix(1) } - - /** - * Gets the text for this escape. That is e.g. "\w". - */ - private string getText() { result = re.getText().substring(start, end) } - - /** - * Holds if this is a unicode escape. - */ - private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] } - - /** - * Gets the unicode char for this escape. - * E.g. for `\u0061` this returns "a". - */ - private string getUnicode() { - exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) | - result = codepoint.toUnicode() - ) - } - - /** - * Gets int value for the `index`th char in the hex number of the unicode escape. - * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex). - */ - private int getHexValueFromUnicode(int index) { - this.isUnicode() and - exists(string hex, string char | hex = this.getText().suffix(2) | - char = hex.charAt(index) and - result = 16.pow(hex.length() - index - 1) * toHex(char) - ) - } -} - -/** - * Gets the hex number for the `hex` char. - */ -private int toHex(string hex) { - hex = [0 .. 9].toString() and - result = hex.toInt() - or - result = 10 and hex = ["a", "A"] - or - result = 11 and hex = ["b", "B"] - or - result = 12 and hex = ["c", "C"] - or - result = 13 and hex = ["d", "D"] - or - result = 14 and hex = ["e", "E"] - or - result = 15 and hex = ["f", "F"] -} - -/** - * A word boundary, that is, a regular expression term of the form `\b`. - */ -class RegExpWordBoundary extends RegExpSpecialChar { - RegExpWordBoundary() { this.getChar() = "\\b" } -} - -/** - * A character class escape in a regular expression. - * That is, an escaped character that denotes multiple characters. - * - * Examples: - * - * ``` - * \w - * \S - * ``` - */ -class RegExpCharacterClassEscape extends RegExpEscape { - RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } -} - -/** - * A character class in a regular expression. - * - * Examples: - * - * ``` - * [a-z_] - * [^<>&] - * ``` - */ -class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { - RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } - - /** Holds if this character class is inverted, matching the opposite of its content. */ - predicate isInverted() { re.getChar(start + 1) = "^" } - - /** Gets the `i`th char inside this charater class. */ - string getCharThing(int i) { result = re.getChar(i + start) } - - /** Holds if this character class can match anything. */ - predicate isUniversalClass() { - // [^] - this.isInverted() and not exists(this.getAChild()) - or - // [\w\W] and similar - not this.isInverted() and - exists(string cce1, string cce2 | - cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and - cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() - | - cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() - ) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegex() = re and - exists(int itemStart, int itemEnd | - result.getStart() = itemStart and - re.char_set_start(start, itemStart) and - re.char_set_child(start, itemStart, itemEnd) and - result.getEnd() = itemEnd - ) - or - i > 0 and - result.getRegex() = re and - exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() | - result.getStart() = itemStart and - re.char_set_child(start, itemStart, result.getEnd()) - ) - } - - override string getPrimaryQLClass() { result = "RegExpCharacterClass" } -} - -/** - * A character range in a character class in a regular expression. - * - * Example: - * - * ``` - * a-z - * ``` - */ -class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { - int lower_end; - int upper_start; - - RegExpCharacterRange() { - this = TRegExpCharacterRange(re, start, end) and - re.charRange(_, start, lower_end, upper_start, end) - } - - /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */ - predicate isRange(string lo, string hi) { - lo = re.getText().substring(start, lower_end) and - hi = re.getText().substring(upper_start, end) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegex() = re and - result.getStart() = start and - result.getEnd() = lower_end - or - i = 1 and - result.getRegex() = re and - result.getStart() = upper_start and - result.getEnd() = end - } - - override string getPrimaryQLClass() { result = "RegExpCharacterRange" } -} - -/** - * A normal character in a regular expression, that is, a character - * without special meaning. This includes escaped characters. - * - * Examples: - * ``` - * t - * \t - * ``` - */ -class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { - RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } - - /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) - */ - predicate isCharacter() { any() } - - /** Gets the string representation of the char matched by this term. */ - string getValue() { result = re.getText().substring(start, end) } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpNormalChar" } -} - -/** - * A constant regular expression term, that is, a regular expression - * term matching a single string. Currently, this will always be a single character. - * - * Example: - * - * ``` - * a - * ``` - */ -class RegExpConstant extends RegExpTerm { - string value; - - RegExpConstant() { - this = TRegExpNormalChar(re, start, end) and - not this instanceof RegExpCharacterClassEscape and - // exclude chars in qualifiers - // TODO: push this into regex library - not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) | - qstart <= start and end <= qend - ) and - value = this.(RegExpNormalChar).getValue() - } - - /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) - */ - predicate isCharacter() { any() } - - /** Gets the string matched by this constant term. */ - string getValue() { result = value } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpConstant" } -} - -/** - * A grouped regular expression. - * - * Examples: - * - * ``` - * (ECMA|Java) - * (?:ECMA|Java) - * (?['"]) - * ``` - */ -class RegExpGroup extends RegExpTerm, TRegExpGroup { - RegExpGroup() { this = TRegExpGroup(re, start, end) } - - /** - * Gets the index of this capture group within the enclosing regular - * expression literal. + * Example: * - * For example, in the regular expression `/((a?).)(?:b)/`, the - * group `((a?).)` has index 1, the group `(a?)` nested inside it - * has index 2, and the group `(?:b)` has no index, since it is - * not a capture group. + * ``` + * ((ECMA|Java)[sS]cript)* + * ``` */ - int getNumber() { result = re.getGroupNumber(start, end) } + class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier { + int part_end; + boolean maybe_empty; + boolean may_repeat_forever; - /** Holds if this is a capture group. */ - predicate isCapture() { exists(this.getNumber()) } + RegExpQuantifier() { + this = TRegExpQuantifier(re, start, end) and + re.qualifiedPart(start, part_end, end, maybe_empty, may_repeat_forever) + } - /** Holds if this is a named capture group. */ - predicate isNamed() { exists(this.getName()) } - - /** Gets the name of this capture group, if any. */ - string getName() { result = re.getGroupName(start, end) } - - override RegExpTerm getChild(int i) { - result.getRegex() = re and - i = 0 and - re.groupContents(start, end, result.getStart(), result.getEnd()) - } - - override string getPrimaryQLClass() { result = "RegExpGroup" } -} - -/** - * A special character in a regular expression. - * - * Examples: - * ``` - * ^ - * $ - * . - * ``` - */ -class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { - string char; - - RegExpSpecialChar() { - this = TRegExpSpecialChar(re, start, end) and - re.specialCharacter(start, end, char) - } - - /** - * Holds if this constant represents a valid Unicode character (as opposed - * to a surrogate code point that does not correspond to a character by itself.) - */ - predicate isCharacter() { any() } - - /** Gets the char for this term. */ - string getChar() { result = char } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpSpecialChar" } -} - -/** - * A dot regular expression. - * - * Example: - * - * ``` - * . - * ``` - */ -class RegExpDot extends RegExpSpecialChar { - RegExpDot() { this.getChar() = "." } - - override string getPrimaryQLClass() { result = "RegExpDot" } -} - -/** - * A dollar assertion `$` or `\Z` matching the end of a line. - * - * Example: - * - * ``` - * $ - * ``` - */ -class RegExpDollar extends RegExpSpecialChar { - RegExpDollar() { this.getChar() = ["$", "\\Z"] } - - override string getPrimaryQLClass() { result = "RegExpDollar" } -} - -/** - * A caret assertion `^` or `\A` matching the beginning of a line. - * - * Example: - * - * ``` - * ^ - * ``` - */ -class RegExpCaret extends RegExpSpecialChar { - RegExpCaret() { this.getChar() = ["^", "\\A"] } - - override string getPrimaryQLClass() { result = "RegExpCaret" } -} - -/** - * A zero-width match, that is, either an empty group or an assertion. - * - * Examples: - * ``` - * () - * (?=\w) - * ``` - */ -class RegExpZeroWidthMatch extends RegExpGroup { - RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } - - override RegExpTerm getChild(int i) { none() } - - override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" } -} - -/** - * A zero-width lookahead or lookbehind assertion. - * - * Examples: - * - * ``` - * (?=\w) - * (?!\n) - * (?<=\.) - * (? 0 and + result.getRegex() = re and + exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | + result.getStart() = itemStart and + re.item(itemStart, result.getEnd()) + ) ) } -} - -/** - * A zero-width lookahead assertion. - * - * Examples: - * - * ``` - * (?=\w) - * (?!\n) - * ``` - */ -abstract class RegExpLookahead extends RegExpSubPattern { } - -/** - * A positive-lookahead assertion. - * - * Examples: - * - * ``` - * (?=\w) - * ``` - */ -class RegExpPositiveLookahead extends RegExpLookahead { - RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) } - - override string getPrimaryQLClass() { result = "RegExpPositiveLookahead" } -} - -/** - * A negative-lookahead assertion. - * - * Examples: - * - * ``` - * (?!\n) - * ``` - */ -class RegExpNegativeLookahead extends RegExpLookahead { - RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end) } - - override string getPrimaryQLClass() { result = "RegExpNegativeLookahead" } -} - -/** - * A zero-width lookbehind assertion. - * - * Examples: - * - * ``` - * (?<=\.) - * (?` - * in a regular expression. - * - * Examples: - * - * ``` - * \1 - * (?P=quote) - * ``` - */ -class RegExpBackRef extends RegExpTerm, TRegExpBackRef { - RegExpBackRef() { this = TRegExpBackRef(re, start, end) } /** - * Gets the number of the capture group this back reference refers to, if any. + * An alternative term, that is, a term of the form `a|b`. + * + * Example: + * + * ``` + * ECMA|Java + * ``` */ - int getNumber() { result = re.getBackrefNumber(start, end) } + class RegExpAlt extends RegExpTerm, TRegExpAlt { + RegExpAlt() { this = TRegExpAlt(re, start, end) } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegex() = re and + result.getStart() = start and + exists(int part_end | + re.alternationOption(start, end, start, part_end) and + result.getEnd() = part_end + ) + or + i > 0 and + result.getRegex() = re and + exists(int part_start | + part_start = this.getChild(i - 1).getEnd() + 1 // allow for the | + | + result.getStart() = part_start and + re.alternationOption(start, end, part_start, result.getEnd()) + ) + } + + override string getPrimaryQLClass() { result = "RegExpAlt" } + } + + class RegExpCharEscape = RegExpEscape; /** - * Gets the name of the capture group this back reference refers to, if any. + * An escaped regular expression term, that is, a regular expression + * term starting with a backslash, which is not a backreference. + * + * Example: + * + * ``` + * \. + * \w + * ``` */ - string getName() { result = re.getBackrefName(start, end) } + class RegExpEscape extends RegExpNormalChar { + RegExpEscape() { re.escapedCharacter(start, end) } - /** Gets the capture group this back reference refers to. */ - RegExpGroup getGroup() { - this.hasLiteralAndNumber(result.getLiteral(), result.getNumber()) or - this.hasLiteralAndName(result.getLiteral(), result.getName()) + /** + * Gets the name of the escaped; for example, `w` for `\w`. + * TODO: Handle named escapes. + */ + override string getValue() { + not this.isUnicode() and + this.isIdentityEscape() and + result = this.getUnescaped() + or + this.getUnescaped() = "n" and result = "\n" + or + this.getUnescaped() = "r" and result = "\r" + or + this.getUnescaped() = "t" and result = "\t" + or + this.getUnescaped() = "f" and result = 12.toUnicode() + or + this.getUnescaped() = "v" and result = 11.toUnicode() + or + this.isUnicode() and + result = this.getUnicode() + } + + /** Holds if this terms name is given by the part following the escape character. */ + predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] } + + override string getPrimaryQLClass() { result = "RegExpEscape" } + + /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */ + string getUnescaped() { result = this.getText().suffix(1) } + + /** + * Gets the text for this escape. That is e.g. "\w". + */ + private string getText() { result = re.getText().substring(start, end) } + + /** + * Holds if this is a unicode escape. + */ + private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] } + + /** + * Gets the unicode char for this escape. + * E.g. for `\u0061` this returns "a". + */ + private string getUnicode() { + exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) | + result = codepoint.toUnicode() + ) + } + + /** + * Gets int value for the `index`th char in the hex number of the unicode escape. + * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex). + */ + private int getHexValueFromUnicode(int index) { + this.isUnicode() and + exists(string hex, string char | hex = this.getText().suffix(2) | + char = hex.charAt(index) and + result = 16.pow(hex.length() - index - 1) * toHex(char) + ) + } } - /** Join-order helper for `getGroup`. */ - pragma[nomagic] - private predicate hasLiteralAndNumber(RegExpLiteral literal, int number) { - literal = this.getLiteral() and - number = this.getNumber() + /** + * Gets the hex number for the `hex` char. + */ + private int toHex(string hex) { + hex = [0 .. 9].toString() and + result = hex.toInt() + or + result = 10 and hex = ["a", "A"] + or + result = 11 and hex = ["b", "B"] + or + result = 12 and hex = ["c", "C"] + or + result = 13 and hex = ["d", "D"] + or + result = 14 and hex = ["e", "E"] + or + result = 15 and hex = ["f", "F"] } - /** Join-order helper for `getGroup`. */ - pragma[nomagic] - private predicate hasLiteralAndName(RegExpLiteral literal, string name) { - literal = this.getLiteral() and - name = this.getName() + /** + * A word boundary, that is, a regular expression term of the form `\b`. + */ + class RegExpWordBoundary extends RegExpSpecialChar { + RegExpWordBoundary() { this.getChar() = "\\b" } } - override RegExpTerm getChild(int i) { none() } + /** + * A character class escape in a regular expression. + * That is, an escaped character that denotes multiple characters. + * + * Examples: + * + * ``` + * \w + * \S + * ``` + */ + class RegExpCharacterClassEscape extends RegExpEscape { + RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] } - override string getPrimaryQLClass() { result = "RegExpBackRef" } + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" } + } + + /** + * A character class in a regular expression. + * + * Examples: + * + * ``` + * [a-z_] + * [^<>&] + * ``` + */ + class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { + RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } + + /** Holds if this character class is inverted, matching the opposite of its content. */ + predicate isInverted() { re.getChar(start + 1) = "^" } + + /** Gets the `i`th char inside this charater class. */ + string getCharThing(int i) { result = re.getChar(i + start) } + + /** Holds if this character class can match anything. */ + predicate isUniversalClass() { + // [^] + this.isInverted() and not exists(this.getAChild()) + or + // [\w\W] and similar + not this.isInverted() and + exists(string cce1, string cce2 | + cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegex() = re and + exists(int itemStart, int itemEnd | + result.getStart() = itemStart and + re.char_set_start(start, itemStart) and + re.char_set_child(start, itemStart, itemEnd) and + result.getEnd() = itemEnd + ) + or + i > 0 and + result.getRegex() = re and + exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() | + result.getStart() = itemStart and + re.char_set_child(start, itemStart, result.getEnd()) + ) + } + + override string getPrimaryQLClass() { result = "RegExpCharacterClass" } + } + + /** + * A character range in a character class in a regular expression. + * + * Example: + * + * ``` + * a-z + * ``` + */ + class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { + int lower_end; + int upper_start; + + RegExpCharacterRange() { + this = TRegExpCharacterRange(re, start, end) and + re.charRange(_, start, lower_end, upper_start, end) + } + + /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */ + predicate isRange(string lo, string hi) { + lo = re.getText().substring(start, lower_end) and + hi = re.getText().substring(upper_start, end) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegex() = re and + result.getStart() = start and + result.getEnd() = lower_end + or + i = 1 and + result.getRegex() = re and + result.getStart() = upper_start and + result.getEnd() = end + } + + override string getPrimaryQLClass() { result = "RegExpCharacterRange" } + } + + /** + * A normal character in a regular expression, that is, a character + * without special meaning. This includes escaped characters. + * + * Examples: + * ``` + * t + * \t + * ``` + */ + class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { + RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string representation of the char matched by this term. */ + string getValue() { result = re.getText().substring(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpNormalChar" } + } + + /** + * A constant regular expression term, that is, a regular expression + * term matching a single string. Currently, this will always be a single character. + * + * Example: + * + * ``` + * a + * ``` + */ + class RegExpConstant extends RegExpTerm { + string value; + + RegExpConstant() { + this = TRegExpNormalChar(re, start, end) and + not this instanceof RegExpCharacterClassEscape and + // exclude chars in qualifiers + // TODO: push this into regex library + not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) | + qstart <= start and end <= qend + ) and + value = this.(RegExpNormalChar).getValue() + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string matched by this constant term. */ + string getValue() { result = value } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpConstant" } + } + + /** + * A grouped regular expression. + * + * Examples: + * + * ``` + * (ECMA|Java) + * (?:ECMA|Java) + * (?['"]) + * ``` + */ + class RegExpGroup extends RegExpTerm, TRegExpGroup { + RegExpGroup() { this = TRegExpGroup(re, start, end) } + + /** + * Gets the index of this capture group within the enclosing regular + * expression literal. + * + * For example, in the regular expression `/((a?).)(?:b)/`, the + * group `((a?).)` has index 1, the group `(a?)` nested inside it + * has index 2, and the group `(?:b)` has no index, since it is + * not a capture group. + */ + int getNumber() { result = re.getGroupNumber(start, end) } + + /** Holds if this is a capture group. */ + predicate isCapture() { exists(this.getNumber()) } + + /** Holds if this is a named capture group. */ + predicate isNamed() { exists(this.getName()) } + + /** Gets the name of this capture group, if any. */ + string getName() { result = re.getGroupName(start, end) } + + override RegExpTerm getChild(int i) { + result.getRegex() = re and + i = 0 and + re.groupContents(start, end, result.getStart(), result.getEnd()) + } + + override string getPrimaryQLClass() { result = "RegExpGroup" } + } + + /** + * A special character in a regular expression. + * + * Examples: + * ``` + * ^ + * $ + * . + * ``` + */ + class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { + string char; + + RegExpSpecialChar() { + this = TRegExpSpecialChar(re, start, end) and + re.specialCharacter(start, end, char) + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the char for this term. */ + string getChar() { result = char } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpSpecialChar" } + } + + /** + * A dot regular expression. + * + * Example: + * + * ``` + * . + * ``` + */ + class RegExpDot extends RegExpSpecialChar { + RegExpDot() { this.getChar() = "." } + + override string getPrimaryQLClass() { result = "RegExpDot" } + } + + /** + * A dollar assertion `$` or `\Z` matching the end of a line. + * + * Example: + * + * ``` + * $ + * ``` + */ + class RegExpDollar extends RegExpSpecialChar { + RegExpDollar() { this.getChar() = ["$", "\\Z"] } + + override string getPrimaryQLClass() { result = "RegExpDollar" } + } + + /** + * A caret assertion `^` or `\A` matching the beginning of a line. + * + * Example: + * + * ``` + * ^ + * ``` + */ + class RegExpCaret extends RegExpSpecialChar { + RegExpCaret() { this.getChar() = ["^", "\\A"] } + + override string getPrimaryQLClass() { result = "RegExpCaret" } + } + + /** + * A zero-width match, that is, either an empty group or an assertion. + * + * Examples: + * ``` + * () + * (?=\w) + * ``` + */ + class RegExpZeroWidthMatch extends RegExpGroup { + RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" } + } + + /** + * A zero-width lookahead or lookbehind assertion. + * + * Examples: + * + * ``` + * (?=\w) + * (?!\n) + * (?<=\.) + * (?` + * in a regular expression. + * + * Examples: + * + * ``` + * \1 + * (?P=quote) + * ``` + */ + class RegExpBackRef extends RegExpTerm, TRegExpBackRef { + RegExpBackRef() { this = TRegExpBackRef(re, start, end) } + + /** + * Gets the number of the capture group this back reference refers to, if any. + */ + int getNumber() { result = re.getBackrefNumber(start, end) } + + /** + * Gets the name of the capture group this back reference refers to, if any. + */ + string getName() { result = re.getBackrefName(start, end) } + + /** Gets the capture group this back reference refers to. */ + RegExpGroup getGroup() { + this.hasLiteralAndNumber(result.getLiteral(), result.getNumber()) or + this.hasLiteralAndName(result.getLiteral(), result.getName()) + } + + /** Join-order helper for `getGroup`. */ + pragma[nomagic] + private predicate hasLiteralAndNumber(RegExpLiteral literal, int number) { + literal = this.getLiteral() and + number = this.getNumber() + } + + /** Join-order helper for `getGroup`. */ + pragma[nomagic] + private predicate hasLiteralAndName(RegExpLiteral literal, string name) { + literal = this.getLiteral() and + name = this.getName() + } + + override RegExpTerm getChild(int i) { none() } + + override string getPrimaryQLClass() { result = "RegExpBackRef" } + } } - -/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */ -RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() }