mirror of
https://github.com/github/codeql.git
synced 2026-01-17 08:24:46 +01:00
Since we calculate the end column by offset, we must believe that the end line is the same as the start line.
1135 lines
32 KiB
Plaintext
1135 lines
32 KiB
Plaintext
/** Provides a class hierarchy corresponding to a parse tree of regular expressions. */
|
|
|
|
import python
|
|
private import semmle.python.regex
|
|
private import codeql.regex.nfa.NfaUtils as NfaUtils
|
|
private import codeql.regex.RegexTreeView
|
|
// exporting as RegexTreeView, and in the top-level scope.
|
|
import Impl as RegexTreeView
|
|
import Impl
|
|
|
|
/**
 * Gets the root term of the parse tree obtained by parsing `re`,
 * if such a tree has been constructed.
 */
RegExpTerm getParsedRegExp(StrConst re) {
  result.isRootTerm() and
  re = result.getRegex()
}
|
|
|
|
/**
 * The branch type behind `RegExpParent`: an element containing a regular
 * expression term, that is, either a string literal (parsed as a regular
 * expression) or another regular expression term.
 *
 * Sequence and alternation branches are only created when there is more than
 * one child. A sequence or alternation with a single element is represented
 * by that element instead, which avoids multiple representations of the
 * same term.
 */
private newtype TRegExpParent =
  /** A string literal used as a regular expression */
  TRegExpLiteral(RegExp re) or
  /** A quantified term */
  TRegExpQuantifier(RegExp re, int start, int end) { re.qualifiedItem(start, end, _, _) } or
  /** A sequence term */
  TRegExpSequence(RegExp re, int start, int end) {
    // A child at index 1 must exist: a one-element sequence is treated as that element instead.
    exists(seqChild(re, start, end, 1)) and
    re.sequence(start, end)
  } or
  /** An alternation term */
  TRegExpAlt(RegExp re, int start, int end) {
    re.alternation(start, end) and
    // The first option must stop short of `end`: a one-option alternation is
    // treated as that option instead.
    exists(int firstOptionEnd | firstOptionEnd < end |
      re.alternationOption(start, end, start, firstOptionEnd)
    )
  } or
  /** A character class term */
  TRegExpCharacterClass(RegExp re, int start, int end) { re.charSet(start, end) } or
  /** A character range term */
  TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or
  /** A group term */
  TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or
  /** A special character */
  TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or
  /** A normal character */
  TRegExpNormalChar(RegExp re, int start, int end) {
    re.normalCharacterSequence(start, end)
    or
    // Escaped characters count as normal characters unless they are special.
    not re.specialCharacter(start, end, _) and
    re.escapedCharacter(start, end)
  } or
  /** A back reference */
  TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) }
|
|
|
|
/**
 * Gets the end offset of the `i`th child of the sequence in `re` that spans
 * from `start` to `end`.
 */
pragma[nomagic]
private int seqChildEnd(RegExp re, int start, int end, int i) {
  exists(RegExpTerm child | child = seqChild(re, start, end, i) | result = child.getEnd())
}
|
|
|
|
/**
 * Gets the `i`th child of the sequence in `re` that spans from `start` to `end`.
 *
 * The first child is the item starting at `start`; each later child is the
 * item starting where the previous child ends.
 *
 * This lives outside the `RegExpSequence` class so that it can also be used in
 * the characteristic predicate of the `TRegExpSequence` branch.
 */
private RegExpTerm seqChild(RegExp re, int start, int end, int i) {
  re.sequence(start, end) and
  result.getRegex() = re and
  (
    // First child: the item that begins at the start of the sequence.
    i = 0 and
    result.getStart() = start and
    re.item(start, result.getEnd())
    or
    // Later children: the item that begins at the end of the preceding child.
    i > 0 and
    exists(int prevEnd | prevEnd = seqChildEnd(re, start, end, i - 1) |
      re.item(prevEnd, result.getEnd()) and
      result.getStart() = prevEnd
    )
  )
}
|
|
|
|
/** An implementation that satisfies the RegexTreeView signature. */
|
|
module Impl implements RegexTreeViewSig {
|
|
/**
|
|
* An element containing a regular expression term, that is, either
|
|
* a string literal (parsed as a regular expression)
|
|
* or another regular expression term.
|
|
*/
|
|
class RegExpParent extends TRegExpParent {
|
|
/** Gets a textual representation of this element. */
|
|
string toString() { result = "RegExpParent" }
|
|
|
|
/** Gets the `i`th child term. */
|
|
abstract RegExpTerm getChild(int i);
|
|
|
|
/** Gets a child term. */
|
|
RegExpTerm getAChild() { result = this.getChild(_) }
|
|
|
|
/** Gets the number of child terms. */
|
|
int getNumChild() { result = count(this.getAChild()) }
|
|
|
|
/** Gets the last child term of this element. */
|
|
RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
|
|
|
|
/** Gets the associated regex. */
|
|
abstract RegExp getRegex();
|
|
}
|
|
|
|
/** A string literal used as a regular expression */
|
|
class RegExpLiteral extends TRegExpLiteral, RegExpParent {
|
|
RegExp re;
|
|
|
|
RegExpLiteral() { this = TRegExpLiteral(re) }
|
|
|
|
override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() }
|
|
|
|
/** Holds if dot, `.`, matches all characters, including newlines. */
|
|
predicate isDotAll() { re.getAMode() = "DOTALL" }
|
|
|
|
/** Holds if matching is case-insensitive for this regex. */
|
|
predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
|
|
|
|
/** Gets a string representing all modes for this regex. */
|
|
string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") }
|
|
|
|
override RegExp getRegex() { result = re }
|
|
|
|
/** Gets the primary QL class for this regex. */
|
|
string getPrimaryQLClass() { result = "RegExpLiteral" }
|
|
}
|
|
|
|
/**
|
|
* A regular expression term, that is, a syntactic part of a regular expression.
|
|
*/
|
|
class RegExpTerm extends RegExpParent {
|
|
RegExp re;
|
|
int start;
|
|
int end;
|
|
|
|
RegExpTerm() {
|
|
this = TRegExpAlt(re, start, end)
|
|
or
|
|
this = TRegExpBackRef(re, start, end)
|
|
or
|
|
this = TRegExpCharacterClass(re, start, end)
|
|
or
|
|
this = TRegExpCharacterRange(re, start, end)
|
|
or
|
|
this = TRegExpNormalChar(re, start, end)
|
|
or
|
|
this = TRegExpGroup(re, start, end)
|
|
or
|
|
this = TRegExpQuantifier(re, start, end)
|
|
or
|
|
this = TRegExpSequence(re, start, end)
|
|
or
|
|
this = TRegExpSpecialChar(re, start, end)
|
|
}
|
|
|
|
/**
|
|
* Gets the outermost term of this regular expression.
|
|
*/
|
|
RegExpTerm getRootTerm() {
|
|
this.isRootTerm() and result = this
|
|
or
|
|
result = this.getParent().(RegExpTerm).getRootTerm()
|
|
}
|
|
|
|
/**
|
|
* Holds if this term is part of a string literal
|
|
* that is interpreted as a regular expression.
|
|
*/
|
|
predicate isUsedAsRegExp() { any() }
|
|
|
|
/**
|
|
* Holds if this is the root term of a regular expression.
|
|
*/
|
|
predicate isRootTerm() { start = 0 and end = re.getText().length() }
|
|
|
|
override RegExpTerm getChild(int i) {
|
|
result = this.(RegExpAlt).getChild(i)
|
|
or
|
|
result = this.(RegExpBackRef).getChild(i)
|
|
or
|
|
result = this.(RegExpCharacterClass).getChild(i)
|
|
or
|
|
result = this.(RegExpCharacterRange).getChild(i)
|
|
or
|
|
result = this.(RegExpNormalChar).getChild(i)
|
|
or
|
|
result = this.(RegExpGroup).getChild(i)
|
|
or
|
|
result = this.(RegExpQuantifier).getChild(i)
|
|
or
|
|
result = this.(RegExpSequence).getChild(i)
|
|
or
|
|
result = this.(RegExpSpecialChar).getChild(i)
|
|
}
|
|
|
|
/**
|
|
* Gets the parent term of this regular expression term, or the
|
|
* regular expression literal if this is the root term.
|
|
*/
|
|
RegExpParent getParent() { result.getAChild() = this }
|
|
|
|
override RegExp getRegex() { result = re }
|
|
|
|
/** Gets the offset at which this term starts. */
|
|
int getStart() { result = start }
|
|
|
|
/** Gets the offset at which this term ends. */
|
|
int getEnd() { result = end }
|
|
|
|
override string toString() { result = re.getText().substring(start, end) }
|
|
|
|
/**
|
|
* Gets the location of the surrounding regex, as locations inside the regex do not exist.
|
|
* To get location information corresponding to the term inside the regex,
|
|
* use `hasLocationInfo`.
|
|
*/
|
|
Location getLocation() { result = re.getLocation() }
|
|
|
|
/**
 * Gets the accumulated content length of the string parts of `re` whose
 * index is lower than `index`, if any. For `index = 0` the result is 0.
 */
private int getPartOffset(int index) {
  index = 0 and result = 0
  or
  index > 0 and
  result =
    this.getPartOffset(index - 1) +
      re.(StrConst).getImplicitlyConcatenatedPart(index - 1).getContentLength()
}
|
|
|
|
/**
|
|
* Gets the `StringPart` in which this `RegExpTerm` resides, if any.
|
|
* `localOffset` will be the offset of this `RegExpTerm` inside `result`.
|
|
*/
|
|
StringPart getPart(int localOffset) {
|
|
exists(int index, int prefixLength | index = max(int i | this.getPartOffset(i) <= start) |
|
|
result = re.(StrConst).getImplicitlyConcatenatedPart(index) and
|
|
result.contextSize(prefixLength, _) and
|
|
// Example:
|
|
// re.compile('...' r"""...this..""")
|
|
// - `start` is the offset from `(` to `this` as counted after concatenating all parts.
|
|
// - we subtract the length of the previous `StringPart`s, `'...'`, to know how far into this `StringPart` we go.
|
|
// - as the prefix 'r"""' is part of the `StringPart`, `this` is found that much further in.
|
|
localOffset = start - this.getPartOffset(index) + prefixLength
|
|
)
|
|
}
|
|
|
|
/**
 * Holds if this term is found at the specified location offsets.
 *
 * Columns are computed by adding the term's offsets to the start column of
 * the enclosing regex (or of the `StringPart` the term resides in), so the
 * term is always reported on a single line: `endline` equals `startline`.
 *
 * `end` is an exclusive offset whereas `endcolumn` is inclusive, which is why
 * both branches subtract 1 when computing `endcolumn`.
 */
predicate hasLocationInfo(
  string filepath, int startline, int startcolumn, int endline, int endcolumn
) {
  // No `StringPart` is known for this term: offset from the start of the
  // whole regex location, plus the length of `re.getPrefix()`.
  not exists(this.getPart(_)) and
  exists(int re_start, int prefix_len | prefix_len = re.getPrefix().length() |
    re.getLocation().hasLocationInfo(filepath, startline, re_start, _, _) and
    startcolumn = re_start + start + prefix_len and
    endline = startline and
    endcolumn = re_start + end + prefix_len - 1
    /* inclusive vs exclusive */
  )
  or
  // The term resides in a known `StringPart`: offset from the start of that part.
  exists(StringPart part, int localOffset, int partStartColumn |
    part = this.getPart(localOffset)
  |
    part.getLocation().hasLocationInfo(filepath, startline, partStartColumn, _, _) and
    startcolumn = partStartColumn + localOffset and
    endline = startline and
    // Inclusive end column, consistent with the branch above: the last
    // character of the term is `(end - start) - 1` columns after the first.
    endcolumn = (end - start) - 1 + startcolumn
  )
}
|
|
|
|
/** Gets the file in which this term is found. */
|
|
File getFile() { result = this.getLocation().getFile() }
|
|
|
|
/** Gets the raw source text of this term. */
|
|
string getRawValue() { result = this.toString() }
|
|
|
|
/** Gets the string literal in which this term is found. */
|
|
RegExpLiteral getLiteral() { result = TRegExpLiteral(re) }
|
|
|
|
/** Gets the regular expression term that is matched (textually) before this one, if any. */
|
|
RegExpTerm getPredecessor() {
|
|
exists(RegExpTerm parent | parent = this.getParent() |
|
|
result = parent.(RegExpSequence).previousElement(this)
|
|
or
|
|
not exists(parent.(RegExpSequence).previousElement(this)) and
|
|
not parent instanceof RegExpSubPattern and
|
|
result = parent.getPredecessor()
|
|
)
|
|
}
|
|
|
|
/** Gets the regular expression term that is matched (textually) after this one, if any. */
|
|
RegExpTerm getSuccessor() {
|
|
exists(RegExpTerm parent | parent = this.getParent() |
|
|
result = parent.(RegExpSequence).nextElement(this)
|
|
or
|
|
not exists(parent.(RegExpSequence).nextElement(this)) and
|
|
not parent instanceof RegExpSubPattern and
|
|
result = parent.getSuccessor()
|
|
)
|
|
}
|
|
|
|
/** Gets the primary QL class for this term. */
|
|
string getPrimaryQLClass() { result = "RegExpTerm" }
|
|
}
|
|
|
|
/**
|
|
* A quantified regular expression term.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* ((ECMA|Java)[sS]cript)*
|
|
* ```
|
|
*/
|
|
class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier {
|
|
int part_end;
|
|
boolean may_repeat_forever;
|
|
|
|
RegExpQuantifier() {
|
|
this = TRegExpQuantifier(re, start, end) and
|
|
re.qualifiedPart(start, part_end, end, _, may_repeat_forever)
|
|
}
|
|
|
|
override RegExpTerm getChild(int i) {
|
|
i = 0 and
|
|
result.getRegex() = re and
|
|
result.getStart() = start and
|
|
result.getEnd() = part_end
|
|
}
|
|
|
|
/** Holds if this term may match an unlimited number of times. */
|
|
predicate mayRepeatForever() { may_repeat_forever = true }
|
|
|
|
/** Gets the qualifier for this term. That is e.g "?" for "a?". */
|
|
string getQualifier() { result = re.getText().substring(part_end, end) }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpQuantifier" }
|
|
}
|
|
|
|
/**
|
|
* A regular expression term that permits unlimited repetitions.
|
|
*/
|
|
class InfiniteRepetitionQuantifier extends RegExpQuantifier {
|
|
InfiniteRepetitionQuantifier() { this.mayRepeatForever() }
|
|
}
|
|
|
|
/**
|
|
* A star-quantified term.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* \w*
|
|
* ```
|
|
*/
|
|
class RegExpStar extends InfiniteRepetitionQuantifier {
|
|
RegExpStar() { this.getQualifier().charAt(0) = "*" }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpStar" }
|
|
}
|
|
|
|
/**
|
|
* A plus-quantified term.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* \w+
|
|
* ```
|
|
*/
|
|
class RegExpPlus extends InfiniteRepetitionQuantifier {
|
|
RegExpPlus() { this.getQualifier().charAt(0) = "+" }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpPlus" }
|
|
}
|
|
|
|
/**
|
|
* An optional term.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* ;?
|
|
* ```
|
|
*/
|
|
class RegExpOpt extends RegExpQuantifier {
|
|
RegExpOpt() { this.getQualifier().charAt(0) = "?" }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpOpt" }
|
|
}
|
|
|
|
/**
|
|
* A range-quantified term
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* \w{2,4}
|
|
* \w{2,}
|
|
* \w{2}
|
|
* ```
|
|
*/
|
|
class RegExpRange extends RegExpQuantifier {
|
|
string upper;
|
|
string lower;
|
|
|
|
RegExpRange() { re.multiples(part_end, end, lower, upper) }
|
|
|
|
/** Gets the string defining the upper bound of this range, if any. */
|
|
string getUpper() { result = upper }
|
|
|
|
/** Gets the string defining the lower bound of this range, if any. */
|
|
string getLower() { result = lower }
|
|
|
|
/**
|
|
* Gets the upper bound of the range, if any.
|
|
*
|
|
* If there is no upper bound, any number of repetitions is allowed.
|
|
* For a term of the form `r{lo}`, both the lower and the upper bound
|
|
* are `lo`.
|
|
*/
|
|
int getUpperBound() { result = this.getUpper().toInt() }
|
|
|
|
/** Gets the lower bound of the range. */
|
|
int getLowerBound() { result = this.getLower().toInt() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpRange" }
|
|
}
|
|
|
|
/**
 * A sequence term, that is, two or more terms matched one after another.
 *
 * Example:
 *
 * ```
 * (ECMA|Java)Script
 * ```
 *
 * This is a sequence with the elements `(ECMA|Java)` and `Script`.
 */
class RegExpSequence extends RegExpTerm, TRegExpSequence {
  RegExpSequence() { this = TRegExpSequence(re, start, end) }

  override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) }

  /** Gets the element preceding `element` in this sequence. */
  RegExpTerm previousElement(RegExpTerm element) { this.nextElement(result) = element }

  /** Gets the element following `element` in this sequence. */
  RegExpTerm nextElement(RegExpTerm element) {
    exists(int pos | result = this.getChild(pos + 1) | element = this.getChild(pos))
  }

  override string getPrimaryQLClass() { result = "RegExpSequence" }
}
|
|
|
|
/**
|
|
* An alternative term, that is, a term of the form `a|b`.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* ECMA|Java
|
|
* ```
|
|
*/
|
|
class RegExpAlt extends RegExpTerm, TRegExpAlt {
|
|
RegExpAlt() { this = TRegExpAlt(re, start, end) }
|
|
|
|
override RegExpTerm getChild(int i) {
|
|
i = 0 and
|
|
result.getRegex() = re and
|
|
result.getStart() = start and
|
|
exists(int part_end |
|
|
re.alternationOption(start, end, start, part_end) and
|
|
result.getEnd() = part_end
|
|
)
|
|
or
|
|
i > 0 and
|
|
result.getRegex() = re and
|
|
exists(int part_start |
|
|
part_start = this.getChild(i - 1).getEnd() + 1 // allow for the |
|
|
|
|
|
result.getStart() = part_start and
|
|
re.alternationOption(start, end, part_start, result.getEnd())
|
|
)
|
|
}
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpAlt" }
|
|
}
|
|
|
|
/**
|
|
* A character escape in a regular expression.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* \.
|
|
* ```
|
|
*/
|
|
class RegExpCharEscape = RegExpEscape;
|
|
|
|
private import codeql.util.Numbers as Numbers
|
|
|
|
/**
|
|
* An escaped regular expression term, that is, a regular expression
|
|
* term starting with a backslash, which is not a backreference.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* \.
|
|
* \w
|
|
* ```
|
|
*/
|
|
class RegExpEscape extends RegExpNormalChar {
|
|
RegExpEscape() { re.escapedCharacter(start, end) }
|
|
|
|
/**
|
|
* Gets the name of the escaped character; for example, `w` for `\w`.
|
|
* TODO: Handle named escapes.
|
|
*/
|
|
override string getValue() {
|
|
not this.isUnicode() and
|
|
this.isIdentityEscape() and
|
|
result = this.getUnescaped()
|
|
or
|
|
this.getUnescaped() = "n" and result = "\n"
|
|
or
|
|
this.getUnescaped() = "r" and result = "\r"
|
|
or
|
|
this.getUnescaped() = "t" and result = "\t"
|
|
or
|
|
this.getUnescaped() = "f" and result = 12.toUnicode()
|
|
or
|
|
this.getUnescaped() = "v" and result = 11.toUnicode()
|
|
or
|
|
this.isUnicode() and
|
|
result = this.getUnicode()
|
|
}
|
|
|
|
/**
 * Holds if this term's name is given by the part following the escape character,
 * that is, it is not one of the control-character escapes (`\n`, `\r`, `\t`, `\f`, `\v`)
 * that `getValue` translates to the character they denote.
 */
predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f", "v"] }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpEscape" }
|
|
|
|
/** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */
|
|
string getUnescaped() { result = this.getText().suffix(1) }
|
|
|
|
/**
|
|
* Gets the text for this escape. That is e.g. "\w".
|
|
*/
|
|
private string getText() { result = re.getText().substring(start, end) }
|
|
|
|
/**
|
|
* Holds if this is a unicode escape.
|
|
*/
|
|
private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] }
|
|
|
|
/**
|
|
* Gets the unicode char for this escape.
|
|
* E.g. for `\u0061` this returns "a".
|
|
*/
|
|
private string getUnicode() {
|
|
result = Numbers::parseHexInt(this.getText().suffix(2)).toUnicode()
|
|
}
|
|
}
|
|
|
|
/**
|
|
* A word boundary, that is, a regular expression term of the form `\b`.
|
|
*/
|
|
class RegExpWordBoundary extends RegExpSpecialChar {
|
|
RegExpWordBoundary() { this.getChar() = "\\b" }
|
|
}
|
|
|
|
/**
|
|
* A non-word boundary, that is, a regular expression term of the form `\B`.
|
|
*/
|
|
class RegExpNonWordBoundary extends RegExpSpecialChar {
|
|
RegExpNonWordBoundary() { this.getChar() = "\\B" }
|
|
}
|
|
|
|
/**
|
|
* A character class escape in a regular expression.
|
|
* That is, an escaped character that denotes multiple characters.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* \w
|
|
* \S
|
|
* ```
|
|
*/
|
|
class RegExpCharacterClassEscape extends RegExpEscape {
|
|
RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] }
|
|
|
|
override RegExpTerm getChild(int i) { none() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
|
|
}
|
|
|
|
/**
|
|
* A character class in a regular expression.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* [a-z_]
|
|
* [^<>&]
|
|
* ```
|
|
*/
|
|
class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass {
|
|
RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) }
|
|
|
|
/** Holds if this character class is inverted, matching the opposite of its content. */
|
|
predicate isInverted() { re.getChar(start + 1) = "^" }
|
|
|
|
/** Gets the `i`th char inside this character class. */
|
|
string getCharThing(int i) { result = re.getChar(i + start) }
|
|
|
|
/** Holds if this character class can match anything. */
|
|
predicate isUniversalClass() {
|
|
// [^]
|
|
this.isInverted() and not exists(this.getAChild())
|
|
or
|
|
// [\w\W] and similar
|
|
not this.isInverted() and
|
|
exists(string cce1, string cce2 |
|
|
cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and
|
|
cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue()
|
|
|
|
|
cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
|
|
)
|
|
}
|
|
|
|
override RegExpTerm getChild(int i) {
|
|
i = 0 and
|
|
result.getRegex() = re and
|
|
exists(int itemStart, int itemEnd |
|
|
result.getStart() = itemStart and
|
|
re.char_set_start(start, itemStart) and
|
|
re.char_set_child(start, itemStart, itemEnd) and
|
|
result.getEnd() = itemEnd
|
|
)
|
|
or
|
|
i > 0 and
|
|
result.getRegex() = re and
|
|
exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() |
|
|
result.getStart() = itemStart and
|
|
re.char_set_child(start, itemStart, result.getEnd())
|
|
)
|
|
}
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpCharacterClass" }
|
|
}
|
|
|
|
/**
|
|
* A character range in a character class in a regular expression.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* a-z
|
|
* ```
|
|
*/
|
|
class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
|
|
int lower_end;
|
|
int upper_start;
|
|
|
|
RegExpCharacterRange() {
|
|
this = TRegExpCharacterRange(re, start, end) and
|
|
re.charRange(_, start, lower_end, upper_start, end)
|
|
}
|
|
|
|
/** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */
|
|
predicate isRange(string lo, string hi) {
|
|
lo = re.getText().substring(start, lower_end) and
|
|
hi = re.getText().substring(upper_start, end)
|
|
}
|
|
|
|
override RegExpTerm getChild(int i) {
|
|
i = 0 and
|
|
result.getRegex() = re and
|
|
result.getStart() = start and
|
|
result.getEnd() = lower_end
|
|
or
|
|
i = 1 and
|
|
result.getRegex() = re and
|
|
result.getStart() = upper_start and
|
|
result.getEnd() = end
|
|
}
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpCharacterRange" }
|
|
}
|
|
|
|
/**
|
|
* A normal character in a regular expression, that is, a character
|
|
* without special meaning. This includes escaped characters.
|
|
*
|
|
* Examples:
|
|
* ```
|
|
* t
|
|
* \t
|
|
* ```
|
|
*/
|
|
additional class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar {
|
|
RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) }
|
|
|
|
/**
|
|
* Holds if this constant represents a valid Unicode character (as opposed
|
|
* to a surrogate code point that does not correspond to a character by itself.)
|
|
*/
|
|
predicate isCharacter() { any() }
|
|
|
|
/** Gets the string representation of the char matched by this term. */
|
|
string getValue() { result = re.getText().substring(start, end) }
|
|
|
|
override RegExpTerm getChild(int i) { none() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpNormalChar" }
|
|
}
|
|
|
|
/**
|
|
* A constant regular expression term, that is, a regular expression
|
|
* term matching a single string. Currently, this will always be a single character.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* a
|
|
* ```
|
|
*/
|
|
class RegExpConstant extends RegExpTerm {
|
|
string value;
|
|
|
|
RegExpConstant() {
|
|
this = TRegExpNormalChar(re, start, end) and
|
|
not this instanceof RegExpCharacterClassEscape and
|
|
// exclude chars in qualifiers
|
|
// TODO: push this into regex library
|
|
not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) |
|
|
qstart <= start and end <= qend
|
|
) and
|
|
value = this.(RegExpNormalChar).getValue()
|
|
}
|
|
|
|
/**
|
|
* Holds if this constant represents a valid Unicode character (as opposed
|
|
* to a surrogate code point that does not correspond to a character by itself.)
|
|
*/
|
|
predicate isCharacter() { any() }
|
|
|
|
/** Gets the string matched by this constant term. */
|
|
string getValue() { result = value }
|
|
|
|
override RegExpTerm getChild(int i) { none() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpConstant" }
|
|
}
|
|
|
|
/**
|
|
* A grouped regular expression.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (ECMA|Java)
|
|
* (?:ECMA|Java)
|
|
* (?<quote>['"])
|
|
* ```
|
|
*/
|
|
class RegExpGroup extends RegExpTerm, TRegExpGroup {
|
|
RegExpGroup() { this = TRegExpGroup(re, start, end) }
|
|
|
|
/**
|
|
* Gets the index of this capture group within the enclosing regular
|
|
* expression literal.
|
|
*
|
|
* For example, in the regular expression `/((a?).)(?:b)/`, the
|
|
* group `((a?).)` has index 1, the group `(a?)` nested inside it
|
|
* has index 2, and the group `(?:b)` has no index, since it is
|
|
* not a capture group.
|
|
*/
|
|
int getNumber() { result = re.getGroupNumber(start, end) }
|
|
|
|
/** Holds if this is a capture group. */
|
|
predicate isCapture() { exists(this.getNumber()) }
|
|
|
|
/** Holds if this is a named capture group. */
|
|
predicate isNamed() { exists(this.getName()) }
|
|
|
|
/** Gets the name of this capture group, if any. */
|
|
string getName() { result = re.getGroupName(start, end) }
|
|
|
|
override RegExpTerm getChild(int i) {
|
|
result.getRegex() = re and
|
|
i = 0 and
|
|
re.groupContents(start, end, result.getStart(), result.getEnd())
|
|
}
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpGroup" }
|
|
}
|
|
|
|
/**
|
|
* A special character in a regular expression.
|
|
*
|
|
* Examples:
|
|
* ```
|
|
* ^
|
|
* $
|
|
* .
|
|
* ```
|
|
*/
|
|
additional class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar {
|
|
string char;
|
|
|
|
RegExpSpecialChar() {
|
|
this = TRegExpSpecialChar(re, start, end) and
|
|
re.specialCharacter(start, end, char)
|
|
}
|
|
|
|
/**
|
|
* Holds if this constant represents a valid Unicode character (as opposed
|
|
* to a surrogate code point that does not correspond to a character by itself.)
|
|
*/
|
|
predicate isCharacter() { any() }
|
|
|
|
/** Gets the char for this term. */
|
|
string getChar() { result = char }
|
|
|
|
override RegExpTerm getChild(int i) { none() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpSpecialChar" }
|
|
}
|
|
|
|
/**
|
|
* A dot regular expression.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* .
|
|
* ```
|
|
*/
|
|
class RegExpDot extends RegExpSpecialChar {
|
|
RegExpDot() { this.getChar() = "." }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpDot" }
|
|
}
|
|
|
|
/**
|
|
* A term that matches a specific position between characters in the string.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* \A
|
|
* ```
|
|
*/
|
|
class RegExpAnchor extends RegExpSpecialChar {
|
|
RegExpAnchor() { this.getChar() = ["\\A", "^", "$", "\\Z"] }
|
|
}
|
|
|
|
/**
|
|
* A dollar assertion `$` or `\Z` matching the end of a line.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* $
|
|
* ```
|
|
*/
|
|
class RegExpDollar extends RegExpAnchor {
|
|
RegExpDollar() { this.getChar() = ["$", "\\Z"] }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpDollar" }
|
|
}
|
|
|
|
/**
|
|
* A caret assertion `^` or `\A` matching the beginning of a line.
|
|
*
|
|
* Example:
|
|
*
|
|
* ```
|
|
* ^
|
|
* ```
|
|
*/
|
|
class RegExpCaret extends RegExpAnchor {
|
|
RegExpCaret() { this.getChar() = ["^", "\\A"] }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpCaret" }
|
|
}
|
|
|
|
/**
|
|
* A zero-width match, that is, either an empty group or an assertion.
|
|
*
|
|
* Examples:
|
|
* ```
|
|
* ()
|
|
* (?=\w)
|
|
* ```
|
|
*/
|
|
additional class RegExpZeroWidthMatch extends RegExpGroup {
|
|
RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) }
|
|
|
|
override RegExpTerm getChild(int i) { none() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" }
|
|
}
|
|
|
|
/**
|
|
* A zero-width lookahead or lookbehind assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?=\w)
|
|
* (?!\n)
|
|
* (?<=\.)
|
|
* (?<!\\)
|
|
* ```
|
|
*/
|
|
class RegExpSubPattern extends RegExpZeroWidthMatch {
|
|
RegExpSubPattern() { not re.emptyGroup(start, end) }
|
|
|
|
/** Gets the lookahead term. */
|
|
RegExpTerm getOperand() {
|
|
exists(int in_start, int in_end | re.groupContents(start, end, in_start, in_end) |
|
|
result.getRegex() = re and
|
|
result.getStart() = in_start and
|
|
result.getEnd() = in_end
|
|
)
|
|
}
|
|
}
|
|
|
|
/**
|
|
* A zero-width lookahead assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?=\w)
|
|
* (?!\n)
|
|
* ```
|
|
*/
|
|
abstract class RegExpLookahead extends RegExpSubPattern { }
|
|
|
|
/**
|
|
* A positive-lookahead assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?=\w)
|
|
* ```
|
|
*/
|
|
class RegExpPositiveLookahead extends RegExpLookahead {
|
|
RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpPositiveLookahead" }
|
|
}
|
|
|
|
/**
|
|
* A negative-lookahead assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?!\n)
|
|
* ```
|
|
*/
|
|
additional class RegExpNegativeLookahead extends RegExpLookahead {
|
|
RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end) }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpNegativeLookahead" }
|
|
}
|
|
|
|
/**
|
|
* A zero-width lookbehind assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?<=\.)
|
|
* (?<!\\)
|
|
* ```
|
|
*/
|
|
abstract class RegExpLookbehind extends RegExpSubPattern { }
|
|
|
|
/**
|
|
* A positive-lookbehind assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?<=\.)
|
|
* ```
|
|
*/
|
|
class RegExpPositiveLookbehind extends RegExpLookbehind {
|
|
RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end) }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpPositiveLookbehind" }
|
|
}
|
|
|
|
/**
|
|
* A negative-lookbehind assertion.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* (?<!\\)
|
|
* ```
|
|
*/
|
|
additional class RegExpNegativeLookbehind extends RegExpLookbehind {
|
|
RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end) }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpNegativeLookbehind" }
|
|
}
|
|
|
|
/**
|
|
* A back reference, that is, a term of the form `\i` or `\k<name>`
|
|
* in a regular expression.
|
|
*
|
|
* Examples:
|
|
*
|
|
* ```
|
|
* \1
|
|
* (?P=quote)
|
|
* ```
|
|
*/
|
|
class RegExpBackRef extends RegExpTerm, TRegExpBackRef {
|
|
RegExpBackRef() { this = TRegExpBackRef(re, start, end) }
|
|
|
|
/**
|
|
* Gets the number of the capture group this back reference refers to, if any.
|
|
*/
|
|
int getNumber() { result = re.getBackrefNumber(start, end) }
|
|
|
|
/**
|
|
* Gets the name of the capture group this back reference refers to, if any.
|
|
*/
|
|
string getName() { result = re.getBackrefName(start, end) }
|
|
|
|
/** Gets the capture group this back reference refers to. */
|
|
RegExpGroup getGroup() {
|
|
this.hasLiteralAndNumber(result.getLiteral(), result.getNumber()) or
|
|
this.hasLiteralAndName(result.getLiteral(), result.getName())
|
|
}
|
|
|
|
/** Join-order helper for `getGroup`. */
|
|
pragma[nomagic]
|
|
private predicate hasLiteralAndNumber(RegExpLiteral literal, int number) {
|
|
literal = this.getLiteral() and
|
|
number = this.getNumber()
|
|
}
|
|
|
|
/** Join-order helper for `getGroup`. */
|
|
pragma[nomagic]
|
|
private predicate hasLiteralAndName(RegExpLiteral literal, string name) {
|
|
literal = this.getLiteral() and
|
|
name = this.getName()
|
|
}
|
|
|
|
override RegExpTerm getChild(int i) { none() }
|
|
|
|
override string getPrimaryQLClass() { result = "RegExpBackRef" }
|
|
}
|
|
|
|
class Top = RegExpParent;
|
|
|
|
/**
|
|
* Holds if `term` is an escape class representing e.g. `\d`.
|
|
* `clazz` is which character class it represents, e.g. "d" for `\d`.
|
|
*/
|
|
predicate isEscapeClass(RegExpTerm term, string clazz) {
|
|
exists(RegExpCharacterClassEscape escape | term = escape | escape.getValue() = clazz)
|
|
}
|
|
|
|
/**
|
|
* Holds if `term` is a possessive quantifier.
|
|
* As python's regexes do not support possessive quantifiers, this never holds, but is used by the shared library.
|
|
*/
|
|
predicate isPossessive(RegExpQuantifier term) { none() }
|
|
|
|
/**
|
|
* Holds if the regex that `term` is part of is used in a way that ignores any leading prefix of the input it's matched against.
|
|
* Not yet implemented for Python.
|
|
*/
|
|
predicate matchesAnyPrefix(RegExpTerm term) { any() }
|
|
|
|
/**
|
|
* Holds if the regex that `term` is part of is used in a way that ignores any trailing suffix of the input it's matched against.
|
|
* Not yet implemented for Python.
|
|
*/
|
|
predicate matchesAnySuffix(RegExpTerm term) { any() }
|
|
|
|
/**
|
|
* Holds if the regular expression should not be considered.
|
|
*
|
|
* We make the pragmatic performance optimization to ignore regular expressions in files
|
|
* that do not belong to the project code (such as installed dependencies).
|
|
*/
|
|
predicate isExcluded(RegExpParent parent) {
|
|
not exists(parent.getRegex().getLocation().getFile().getRelativePath())
|
|
or
|
|
// Regexes with many occurrences of ".*" may cause the polynomial ReDoS computation to explode, so
|
|
// we explicitly exclude these.
|
|
count(int i | exists(parent.getRegex().getText().regexpFind("\\.\\*", i, _)) | i) > 10
|
|
}
|
|
|
|
/**
|
|
* Holds if `root` has the `i` flag for case-insensitive matching.
|
|
*/
|
|
predicate isIgnoreCase(RegExpTerm root) {
|
|
root.isRootTerm() and
|
|
root.getLiteral().isIgnoreCase()
|
|
}
|
|
|
|
/**
|
|
* Holds if `root` has the `s` flag, which makes `.` match all characters, including newlines.
|
|
*/
|
|
predicate isDotAll(RegExpTerm root) {
|
|
root.isRootTerm() and
|
|
root.getLiteral().isDotAll()
|
|
}
|
|
}
|