mirror of
https://github.com/github/codeql.git
synced 2025-12-22 19:56:32 +01:00
Although this wasn't (as far as I know) causing any performance issues,
it was making the join-order badness report quite noisy, and so I
figured it was worth fixing.
Before:
```
Tuple counts for RegexTreeView::RegExpBackRef::getGroup#dispred#f0820431#ff/2@d3441d0b after 84ms:
1501195 ~3% {2} r1 = JOIN RegexTreeView::RegExpTerm::getLiteral#dispred#f0820431#ff_10#join_rhs WITH RegexTreeView::RegExpTerm::getLiteral#dispred#f0820431#ff_10#join_rhs ON FIRST 1 OUTPUT Rhs.1 'result', Lhs.1 'result'
149 ~0% {5} r2 = JOIN r1 WITH RegexTreeView::RegExpBackRef#class#31aac2a7#ffff ON FIRST 1 OUTPUT Rhs.1, Rhs.2, Rhs.3, Lhs.1 'result', Lhs.0 'this'
149 ~1% {3} r3 = JOIN r2 WITH regex::RegexString::numbered_backreference#dispred#f0820431#ffff ON FIRST 3 OUTPUT Lhs.3 'result', Rhs.3, Lhs.4 'this'
4 ~0% {2} r4 = JOIN r3 WITH RegexTreeView::RegExpGroup::getNumber#dispred#f0820431#ff ON FIRST 2 OUTPUT Lhs.2 'this', Lhs.0 'result'
1501195 ~3% {2} r5 = JOIN RegexTreeView::RegExpTerm::getLiteral#dispred#f0820431#ff_10#join_rhs WITH RegexTreeView::RegExpTerm::getLiteral#dispred#f0820431#ff_10#join_rhs ON FIRST 1 OUTPUT Lhs.1 'result', Rhs.1 'result'
42526 ~0% {5} r6 = JOIN r5 WITH RegexTreeView::RegExpGroup#31aac2a7#ffff ON FIRST 1 OUTPUT Lhs.1 'this', Lhs.0 'result', Rhs.1, Rhs.2, Rhs.3
22 ~0% {8} r7 = JOIN r6 WITH RegexTreeView::RegExpBackRef#class#31aac2a7#ffff ON FIRST 1 OUTPUT Lhs.2, Lhs.3, Lhs.4, Lhs.1 'result', Lhs.0 'this', Rhs.1, Rhs.2, Rhs.3
0 ~0% {6} r8 = JOIN r7 WITH regex::RegexString::getGroupName#dispred#f0820431#ffff ON FIRST 3 OUTPUT Lhs.5, Lhs.6, Lhs.7, Rhs.3, Lhs.3 'result', Lhs.4 'this'
0 ~0% {2} r9 = JOIN r8 WITH regex::RegexString::named_backreference#dispred#f0820431#ffff ON FIRST 4 OUTPUT Lhs.5 'this', Lhs.4 'result'
4 ~0% {2} r10 = r4 UNION r9
return r10
```
In this case I opted for a classical solution: tying together the
literal and number (or name) part of the backreference in order to
encourage a two-column join.
After:
```
Tuple counts for RegexTreeView::RegExpBackRef::getGroup#dispred#f0820431#ff/2@b0cc4d5n after 0ms:
898 ~1% {3} r1 = JOIN RegexTreeView::RegExpTerm::getLiteral#dispred#f0820431#ff WITH RegexTreeView::RegExpGroup::getNumber#dispred#f0820431#ff ON FIRST 1 OUTPUT Lhs.1, Rhs.1, Lhs.0 'result'
4 ~0% {2} r2 = JOIN r1 WITH RegexTreeView::RegExpBackRef::hasLiteralAndNumber#f0820431#fff_120#join_rhs ON FIRST 2 OUTPUT Rhs.2 'this', Lhs.2 'result'
1110 ~0% {5} r3 = JOIN RegexTreeView::RegExpGroup#31aac2a7#ffff WITH RegexTreeView::RegExpTerm::getLiteral#dispred#f0820431#ff ON FIRST 1 OUTPUT Lhs.1, Lhs.2, Lhs.3, Lhs.0 'result', Rhs.1
146 ~0% {3} r4 = JOIN r3 WITH regex::RegexString::getGroupName#dispred#f0820431#ffff ON FIRST 3 OUTPUT Lhs.4, Rhs.3, Lhs.3 'result'
0 ~0% {2} r5 = JOIN r4 WITH RegexTreeView::RegExpBackRef::hasLiteralAndName#f0820431#fff_120#join_rhs ON FIRST 2 OUTPUT Rhs.2 'this', Lhs.2 'result'
4 ~0% {2} r6 = r2 UNION r5
return r6
```
1028 lines
26 KiB
Plaintext
1028 lines
26 KiB
Plaintext
/** Provides a class hierarchy corresponding to a parse tree of regular expressions. */
|
|
|
|
import python
|
|
private import semmle.python.regex
|
|
|
|
/**
 * An element containing a regular expression term, that is, either
 * a string literal (parsed as a regular expression)
 * or another regular expression term.
 *
 * For sequences and alternations, we require more than one child.
 * Otherwise, we wish to represent the term differently (as that single child).
 * This avoids multiple representations of the same term.
 */
newtype TRegExpParent =
  /** A string literal used as a regular expression */
  TRegExpLiteral(Regex re) or
  /** A quantified term */
  TRegExpQuantifier(Regex re, int start, int end) { re.qualifiedItem(start, end, _, _) } or
  /** A sequence term */
  TRegExpSequence(Regex re, int start, int end) {
    re.sequence(start, end) and
    exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
  } or
  /** An alternation term */
  TRegExpAlt(Regex re, int start, int end) {
    re.alternation(start, end) and
    exists(int part_end |
      re.alternationOption(start, end, start, part_end) and
      part_end < end
    ) // if an alternation does not have more than one element, it should be treated as that element instead.
  } or
  /** A character class term */
  TRegExpCharacterClass(Regex re, int start, int end) { re.charSet(start, end) } or
  /** A character range term */
  TRegExpCharacterRange(Regex re, int start, int end) { re.charRange(_, start, _, _, end) } or
  /** A group term */
  TRegExpGroup(Regex re, int start, int end) { re.group(start, end) } or
  /** A special character */
  TRegExpSpecialChar(Regex re, int start, int end) { re.specialCharacter(start, end, _) } or
  /** A normal character */
  TRegExpNormalChar(Regex re, int start, int end) {
    re.normalCharacterSequence(start, end)
    or
    // an escaped character that is not already represented as a special character
    re.escapedCharacter(start, end) and
    not re.specialCharacter(start, end, _)
  } or
  /** A back reference */
  TRegExpBackRef(Regex re, int start, int end) { re.backreference(start, end) }
|
|
|
|
/**
 * An element containing a regular expression term, that is, either
 * a string literal (parsed as a regular expression)
 * or another regular expression term.
 */
class RegExpParent extends TRegExpParent {
  /** Gets a textual representation of this element. */
  string toString() { result = "RegExpParent" }

  /** Gets the `i`th child term. */
  abstract RegExpTerm getChild(int i);

  /** Gets a child term. */
  RegExpTerm getAChild() { result = this.getChild(_) }

  /** Gets the number of child terms. */
  int getNumChild() { result = count(this.getAChild()) }

  /** Gets the associated regex. */
  abstract Regex getRegex();
}
|
|
|
|
/** A string literal used as a regular expression */
class RegExpLiteral extends TRegExpLiteral, RegExpParent {
  Regex re;

  RegExpLiteral() { this = TRegExpLiteral(re) }

  /** Gets the root term of the regex as the single child (index 0) of this literal. */
  override RegExpTerm getChild(int i) { i = 0 and result.getRegex() = re and result.isRootTerm() }

  /** Holds if dot, `.`, matches all characters, including newlines. */
  predicate isDotAll() { re.getAMode() = "DOTALL" }

  /** Holds if matching is case-insensitive for this regex. */
  predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }

  /** Gets a string representing all modes for this regex, separated by `" | "`. */
  string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") }

  override Regex getRegex() { result = re }

  /** Gets the primary QL class for this regex. */
  string getPrimaryQLClass() { result = "RegExpLiteral" }
}
|
|
|
|
/**
 * A regular expression term, that is, a syntactic part of a regular expression.
 *
 * Each term is identified by its regex `re` and the offsets `start` and `end`
 * of the term within the text of that regex.
 */
class RegExpTerm extends RegExpParent {
  Regex re;
  int start;
  int end;

  RegExpTerm() {
    this = TRegExpAlt(re, start, end)
    or
    this = TRegExpBackRef(re, start, end)
    or
    this = TRegExpCharacterClass(re, start, end)
    or
    this = TRegExpCharacterRange(re, start, end)
    or
    this = TRegExpNormalChar(re, start, end)
    or
    this = TRegExpGroup(re, start, end)
    or
    this = TRegExpQuantifier(re, start, end)
    or
    this = TRegExpSequence(re, start, end)
    or
    this = TRegExpSpecialChar(re, start, end)
  }

  /**
   * Gets the outermost term of this regular expression.
   */
  RegExpTerm getRootTerm() {
    this.isRootTerm() and result = this
    or
    result = this.getParent().(RegExpTerm).getRootTerm()
  }

  /**
   * Holds if this term is part of a string literal
   * that is interpreted as a regular expression.
   */
  predicate isUsedAsRegExp() { any() }

  /**
   * Holds if this is the root term of a regular expression.
   */
  predicate isRootTerm() { start = 0 and end = re.getText().length() }

  /** Gets the `i`th child term, dispatching to the concrete term class. */
  override RegExpTerm getChild(int i) {
    result = this.(RegExpAlt).getChild(i)
    or
    result = this.(RegExpBackRef).getChild(i)
    or
    result = this.(RegExpCharacterClass).getChild(i)
    or
    result = this.(RegExpCharacterRange).getChild(i)
    or
    result = this.(RegExpNormalChar).getChild(i)
    or
    result = this.(RegExpGroup).getChild(i)
    or
    result = this.(RegExpQuantifier).getChild(i)
    or
    result = this.(RegExpSequence).getChild(i)
    or
    result = this.(RegExpSpecialChar).getChild(i)
  }

  /**
   * Gets the parent term of this regular expression term, or the
   * regular expression literal if this is the root term.
   */
  RegExpParent getParent() { result.getAChild() = this }

  override Regex getRegex() { result = re }

  /** Gets the offset at which this term starts. */
  int getStart() { result = start }

  /** Gets the offset at which this term ends. */
  int getEnd() { result = end }

  /** Gets the text of this term, that is, the regex text between `start` and `end`. */
  override string toString() { result = re.getText().substring(start, end) }

  /**
   * Gets the location of the surrounding regex, as locations inside the regex do not exist.
   * To get location information corresponding to the term inside the regex,
   * use `hasLocationInfo`.
   */
  Location getLocation() { result = re.getLocation() }

  /** Holds if this term is found at the specified location offsets. */
  predicate hasLocationInfo(
    string filepath, int startline, int startcolumn, int endline, int endcolumn
  ) {
    exists(int re_start, int re_end |
      re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, re_end) and
      // NOTE(review): the `+ 4` / `+ 3` adjustments presumably skip the string
      // literal's prefix and opening quote - confirm against the extractor.
      startcolumn = re_start + start + 4 and
      endcolumn = re_start + end + 3
    )
  }

  /** Gets the file in which this term is found. */
  File getFile() { result = this.getLocation().getFile() }

  /** Gets the raw source text of this term. */
  string getRawValue() { result = this.toString() }

  /** Gets the string literal in which this term is found. */
  RegExpLiteral getLiteral() { result = TRegExpLiteral(re) }

  /** Gets the regular expression term that is matched (textually) before this one, if any. */
  RegExpTerm getPredecessor() {
    exists(RegExpTerm parent | parent = this.getParent() |
      result = parent.(RegExpSequence).previousElement(this)
      or
      // no previous sibling in a sequence: use the parent's predecessor,
      // unless the parent is a lookahead/lookbehind assertion
      not exists(parent.(RegExpSequence).previousElement(this)) and
      not parent instanceof RegExpSubPattern and
      result = parent.getPredecessor()
    )
  }

  /** Gets the regular expression term that is matched (textually) after this one, if any. */
  RegExpTerm getSuccessor() {
    exists(RegExpTerm parent | parent = this.getParent() |
      result = parent.(RegExpSequence).nextElement(this)
      or
      // no next sibling in a sequence: use the parent's successor,
      // unless the parent is a lookahead/lookbehind assertion
      not exists(parent.(RegExpSequence).nextElement(this)) and
      not parent instanceof RegExpSubPattern and
      result = parent.getSuccessor()
    )
  }

  /** Gets the primary QL class for this term. */
  string getPrimaryQLClass() { result = "RegExpTerm" }
}
|
|
|
|
/**
 * A quantified regular expression term.
 *
 * Example:
 *
 * ```
 * ((ECMA|Java)[sS]cript)*
 * ```
 */
class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier {
  /** The offset at which the quantified part ends and the quantifier itself begins. */
  int part_end;
  boolean maybe_empty;
  boolean may_repeat_forever;

  RegExpQuantifier() {
    this = TRegExpQuantifier(re, start, end) and
    re.qualifiedPart(start, part_end, end, maybe_empty, may_repeat_forever)
  }

  /** Gets the quantified term as the single child (index 0). */
  override RegExpTerm getChild(int i) {
    i = 0 and
    result.getRegex() = re and
    result.getStart() = start and
    result.getEnd() = part_end
  }

  /** Holds if this term may match an unlimited number of times. */
  predicate mayRepeatForever() { may_repeat_forever = true }

  /** Gets the quantifier suffix of this term. That is e.g "?" for "a?". */
  string getQualifier() { result = re.getText().substring(part_end, end) }

  override string getPrimaryQLClass() { result = "RegExpQuantifier" }
}
|
|
|
|
/**
 * A regular expression term that permits unlimited repetitions.
 */
class InfiniteRepetitionQuantifier extends RegExpQuantifier {
  InfiniteRepetitionQuantifier() { this.mayRepeatForever() }
}
|
|
|
|
/**
 * A star-quantified term.
 *
 * Example:
 *
 * ```
 * \w*
 * ```
 */
class RegExpStar extends InfiniteRepetitionQuantifier {
  // only the first character of the quantifier suffix is inspected
  RegExpStar() { this.getQualifier().charAt(0) = "*" }

  override string getPrimaryQLClass() { result = "RegExpStar" }
}
|
|
|
|
/**
 * A plus-quantified term.
 *
 * Example:
 *
 * ```
 * \w+
 * ```
 */
class RegExpPlus extends InfiniteRepetitionQuantifier {
  // only the first character of the quantifier suffix is inspected
  RegExpPlus() { this.getQualifier().charAt(0) = "+" }

  override string getPrimaryQLClass() { result = "RegExpPlus" }
}
|
|
|
|
/**
 * An optional term.
 *
 * Example:
 *
 * ```
 * ;?
 * ```
 */
class RegExpOpt extends RegExpQuantifier {
  // only the first character of the quantifier suffix is inspected
  RegExpOpt() { this.getQualifier().charAt(0) = "?" }

  override string getPrimaryQLClass() { result = "RegExpOpt" }
}
|
|
|
|
/**
 * A range-quantified term
 *
 * Examples:
 *
 * ```
 * \w{2,4}
 * \w{2,}
 * \w{2}
 * ```
 */
class RegExpRange extends RegExpQuantifier {
  string upper;
  string lower;

  // the quantifier suffix (from `part_end` to `end`) is a `{lower,upper}` specification
  RegExpRange() { re.multiples(part_end, end, lower, upper) }

  /** Gets the string defining the upper bound of this range, if any. */
  string getUpper() { result = upper }

  /** Gets the string defining the lower bound of this range, if any. */
  string getLower() { result = lower }

  /**
   * Gets the upper bound of the range, if any.
   *
   * If there is no upper bound, any number of repetitions is allowed.
   * For a term of the form `r{lo}`, both the lower and the upper bound
   * are `lo`.
   */
  int getUpperBound() { result = this.getUpper().toInt() }

  /** Gets the lower bound of the range. */
  int getLowerBound() { result = this.getLower().toInt() }

  override string getPrimaryQLClass() { result = "RegExpRange" }
}
|
|
|
|
/**
 * A sequence term.
 *
 * Example:
 *
 * ```
 * (ECMA|Java)Script
 * ```
 *
 * This is a sequence with the elements `(ECMA|Java)` and `Script`.
 */
class RegExpSequence extends RegExpTerm, TRegExpSequence {
  RegExpSequence() { this = TRegExpSequence(re, start, end) }

  override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) }

  /** Gets the element preceding `element` in this sequence. */
  RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) }

  /** Gets the element following `element` in this sequence. */
  RegExpTerm nextElement(RegExpTerm element) {
    exists(int i |
      element = this.getChild(i) and
      result = this.getChild(i + 1)
    )
  }

  override string getPrimaryQLClass() { result = "RegExpSequence" }
}
|
|
|
|
/**
 * Gets the end offset of the `i`th child of the sequence spanning
 * `start` to `end` in `re`.
 *
 * Used by `seqChild` to find where the next item of the sequence starts.
 */
pragma[nomagic]
private int seqChildEnd(Regex re, int start, int end, int i) {
  result = seqChild(re, start, end, i).getEnd()
}
|
|
|
|
/**
 * Gets the `i`th child of the sequence spanning `start` to `end` in `re`.
 *
 * The first child is the item starting at `start`; each later child is the
 * item starting where the previous child ends.
 */
// moved out so we can use it in the charpred
private RegExpTerm seqChild(Regex re, int start, int end, int i) {
  re.sequence(start, end) and
  (
    i = 0 and
    result.getRegex() = re and
    result.getStart() = start and
    exists(int itemEnd |
      re.item(start, itemEnd) and
      result.getEnd() = itemEnd
    )
    or
    i > 0 and
    result.getRegex() = re and
    exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) |
      result.getStart() = itemStart and
      re.item(itemStart, result.getEnd())
    )
  )
}
|
|
|
|
/**
 * An alternative term, that is, a term of the form `a|b`.
 *
 * Example:
 *
 * ```
 * ECMA|Java
 * ```
 */
class RegExpAlt extends RegExpTerm, TRegExpAlt {
  RegExpAlt() { this = TRegExpAlt(re, start, end) }

  /** Gets the `i`th alternative of this alternation. */
  override RegExpTerm getChild(int i) {
    // the first alternative starts where the alternation starts
    i = 0 and
    result.getRegex() = re and
    result.getStart() = start and
    exists(int part_end |
      re.alternationOption(start, end, start, part_end) and
      result.getEnd() = part_end
    )
    or
    // each later alternative starts one character after the previous one ends
    i > 0 and
    result.getRegex() = re and
    exists(int part_start |
      part_start = this.getChild(i - 1).getEnd() + 1 // allow for the |
    |
      result.getStart() = part_start and
      re.alternationOption(start, end, part_start, result.getEnd())
    )
  }

  override string getPrimaryQLClass() { result = "RegExpAlt" }
}
|
|
|
|
/** An alias for `RegExpEscape`. */
class RegExpCharEscape = RegExpEscape;
|
|
|
|
/**
 * An escaped regular expression term, that is, a regular expression
 * term starting with a backslash, which is not a backreference.
 *
 * Example:
 *
 * ```
 * \.
 * \w
 * ```
 */
class RegExpEscape extends RegExpNormalChar {
  RegExpEscape() { re.escapedCharacter(start, end) }

  /**
   * Gets the value of this escape; for example, `w` for `\w`,
   * or a newline character for `\n`.
   * TODO: Handle named escapes.
   */
  override string getValue() {
    this.isIdentityEscape() and result = this.getUnescaped()
    or
    this.getUnescaped() = "n" and result = "\n"
    or
    this.getUnescaped() = "r" and result = "\r"
    or
    this.getUnescaped() = "t" and result = "\t"
    or
    // form feed (code point 12)
    this.getUnescaped() = "f" and result = 12.toUnicode()
    or
    // vertical tab (code point 11)
    this.getUnescaped() = "v" and result = 11.toUnicode()
    or
    this.isUnicode() and
    result = this.getUnicode()
  }

  /**
   * Holds if this term's name is given by the part following the escape character.
   *
   * This excludes exactly the control-character escapes that `getValue`
   * translates to a different character (`\n`, `\r`, `\t`, `\f` and `\v`).
   */
  // NOTE(review): unicode escapes are not excluded here, so for such an escape
  // `getValue` appears to yield both the unescaped text and the decoded
  // character - confirm whether that is intended.
  predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f", "v"] }

  override string getPrimaryQLClass() { result = "RegExpEscape" }

  /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */
  string getUnescaped() { result = this.getText().suffix(1) }

  /**
   * Gets the text for this escape. That is e.g. "\w".
   */
  private string getText() { result = re.getText().substring(start, end) }

  /**
   * Holds if this is a unicode escape.
   */
  private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] }

  /**
   * Gets the unicode char for this escape.
   * E.g. for `\u0061` this returns "a".
   */
  private string getUnicode() {
    exists(int codepoint | codepoint = sum(this.getHexValueFromUnicode(_)) |
      result = codepoint.toUnicode()
    )
  }

  /**
   * Gets int value for the `index`th char in the hex number of the unicode escape.
   * E.g. for `\u0061` and `index = 2` this returns 96 (the number `6` interpreted as hex).
   */
  private int getHexValueFromUnicode(int index) {
    this.isUnicode() and
    exists(string hex, string char | hex = this.getText().suffix(2) |
      char = hex.charAt(index) and
      // weight the digit by its position: the last digit has weight 16^0
      result = 16.pow(hex.length() - index - 1) * toHex(char)
    )
  }
}
|
|
|
|
/**
 * Gets the numeric value (0 to 15) of the single hexadecimal digit `hex`.
 * Both upper-case and lower-case letters are accepted.
 */
private int toHex(string hex) {
  hex = [0 .. 9].toString() and
  result = hex.toInt()
  or
  result = 10 and hex = ["a", "A"]
  or
  result = 11 and hex = ["b", "B"]
  or
  result = 12 and hex = ["c", "C"]
  or
  result = 13 and hex = ["d", "D"]
  or
  result = 14 and hex = ["e", "E"]
  or
  result = 15 and hex = ["f", "F"]
}
|
|
|
|
/**
 * A word boundary, that is, a regular expression term of the form `\b`.
 */
class RegExpWordBoundary extends RegExpSpecialChar {
  RegExpWordBoundary() { this.getChar() = "\\b" }
}
|
|
|
|
/**
 * A character class escape in a regular expression.
 * That is, an escaped character that denotes multiple characters.
 *
 * Examples:
 *
 * ```
 * \w
 * \S
 * ```
 */
class RegExpCharacterClassEscape extends RegExpEscape {
  RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] }

  override RegExpTerm getChild(int i) { none() }

  override string getPrimaryQLClass() { result = "RegExpCharacterClassEscape" }
}
|
|
|
|
/**
 * A character class in a regular expression.
 *
 * Examples:
 *
 * ```
 * [a-z_]
 * [^<>&]
 * ```
 */
class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass {
  RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) }

  /** Holds if this character class is inverted, matching the opposite of its content. */
  predicate isInverted() { re.getChar(start + 1) = "^" }

  /** Gets the character at offset `i` from the start of this character class term. */
  string getCharThing(int i) { result = re.getChar(i + start) }

  /** Holds if this character class can match anything. */
  predicate isUniversalClass() {
    // [^]
    this.isInverted() and not exists(this.getAChild())
    or
    // [\w\W] and similar
    not this.isInverted() and
    exists(string cce1, string cce2 |
      cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and
      cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue()
    |
      // the same letter in different case, e.g. `w` and `W`
      cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
    )
  }

  /** Gets the `i`th item of this character class. */
  override RegExpTerm getChild(int i) {
    // the first item starts at the position reported by `char_set_start`
    i = 0 and
    result.getRegex() = re and
    exists(int itemStart, int itemEnd |
      result.getStart() = itemStart and
      re.char_set_start(start, itemStart) and
      re.char_set_child(start, itemStart, itemEnd) and
      result.getEnd() = itemEnd
    )
    or
    // each later item starts where the previous one ends
    i > 0 and
    result.getRegex() = re and
    exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() |
      result.getStart() = itemStart and
      re.char_set_child(start, itemStart, result.getEnd())
    )
  }

  override string getPrimaryQLClass() { result = "RegExpCharacterClass" }
}
|
|
|
|
/**
 * A character range in a character class in a regular expression.
 *
 * Example:
 *
 * ```
 * a-z
 * ```
 */
class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange {
  /** The offset at which the lower bound of the range ends. */
  int lower_end;
  /** The offset at which the upper bound of the range starts. */
  int upper_start;

  RegExpCharacterRange() {
    this = TRegExpCharacterRange(re, start, end) and
    re.charRange(_, start, lower_end, upper_start, end)
  }

  /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */
  predicate isRange(string lo, string hi) {
    lo = re.getText().substring(start, lower_end) and
    hi = re.getText().substring(upper_start, end)
  }

  /** Gets the lower bound (`i = 0`) or the upper bound (`i = 1`) of this range. */
  override RegExpTerm getChild(int i) {
    i = 0 and
    result.getRegex() = re and
    result.getStart() = start and
    result.getEnd() = lower_end
    or
    i = 1 and
    result.getRegex() = re and
    result.getStart() = upper_start and
    result.getEnd() = end
  }

  override string getPrimaryQLClass() { result = "RegExpCharacterRange" }
}
|
|
|
|
/**
 * A normal character in a regular expression, that is, a character
 * without special meaning. This includes escaped characters.
 *
 * Examples:
 * ```
 * t
 * \t
 * ```
 */
class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar {
  RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) }

  /**
   * Holds if this constant represents a valid Unicode character (as opposed
   * to a surrogate code point that does not correspond to a character by itself.)
   */
  predicate isCharacter() { any() }

  /** Gets the string representation of the char matched by this term. */
  string getValue() { result = re.getText().substring(start, end) }

  override RegExpTerm getChild(int i) { none() }

  override string getPrimaryQLClass() { result = "RegExpNormalChar" }
}
|
|
|
|
/**
 * A constant regular expression term, that is, a regular expression
 * term matching a single string. Currently, this will always be a single character.
 *
 * Example:
 *
 * ```
 * a
 * ```
 */
class RegExpConstant extends RegExpTerm {
  string value;

  RegExpConstant() {
    this = TRegExpNormalChar(re, start, end) and
    not this instanceof RegExpCharacterClassEscape and
    // exclude chars in qualifiers
    // TODO: push this into regex library
    not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) |
      qstart <= start and end <= qend
    ) and
    value = this.(RegExpNormalChar).getValue()
  }

  /**
   * Holds if this constant represents a valid Unicode character (as opposed
   * to a surrogate code point that does not correspond to a character by itself.)
   */
  predicate isCharacter() { any() }

  /** Gets the string matched by this constant term. */
  string getValue() { result = value }

  override RegExpTerm getChild(int i) { none() }

  override string getPrimaryQLClass() { result = "RegExpConstant" }
}
|
|
|
|
/**
 * A grouped regular expression.
 *
 * Examples:
 *
 * ```
 * (ECMA|Java)
 * (?:ECMA|Java)
 * (?<quote>['"])
 * ```
 */
class RegExpGroup extends RegExpTerm, TRegExpGroup {
  RegExpGroup() { this = TRegExpGroup(re, start, end) }

  /**
   * Gets the index of this capture group within the enclosing regular
   * expression literal.
   *
   * For example, in the regular expression `/((a?).)(?:b)/`, the
   * group `((a?).)` has index 1, the group `(a?)` nested inside it
   * has index 2, and the group `(?:b)` has no index, since it is
   * not a capture group.
   */
  int getNumber() { result = re.getGroupNumber(start, end) }

  /** Holds if this is a capture group. */
  predicate isCapture() { exists(this.getNumber()) }

  /** Holds if this is a named capture group. */
  predicate isNamed() { exists(this.getName()) }

  /** Gets the name of this capture group, if any. */
  string getName() { result = re.getGroupName(start, end) }

  /** Gets the contents of this group as the single child (index 0). */
  override RegExpTerm getChild(int i) {
    result.getRegex() = re and
    i = 0 and
    re.groupContents(start, end, result.getStart(), result.getEnd())
  }

  override string getPrimaryQLClass() { result = "RegExpGroup" }
}
|
|
|
|
/**
 * A special character in a regular expression.
 *
 * Examples:
 * ```
 * ^
 * $
 * .
 * ```
 */
class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar {
  string char;

  RegExpSpecialChar() {
    this = TRegExpSpecialChar(re, start, end) and
    re.specialCharacter(start, end, char)
  }

  /**
   * Holds if this constant represents a valid Unicode character (as opposed
   * to a surrogate code point that does not correspond to a character by itself.)
   */
  predicate isCharacter() { any() }

  /** Gets the char for this term. */
  string getChar() { result = char }

  override RegExpTerm getChild(int i) { none() }

  override string getPrimaryQLClass() { result = "RegExpSpecialChar" }
}
|
|
|
|
/**
 * A dot regular expression.
 *
 * Example:
 *
 * ```
 * .
 * ```
 */
class RegExpDot extends RegExpSpecialChar {
  RegExpDot() { this.getChar() = "." }

  override string getPrimaryQLClass() { result = "RegExpDot" }
}
|
|
|
|
/**
 * A dollar assertion `$` or `\Z` matching the end of a line.
 *
 * Example:
 *
 * ```
 * $
 * ```
 */
class RegExpDollar extends RegExpSpecialChar {
  RegExpDollar() { this.getChar() = ["$", "\\Z"] }

  override string getPrimaryQLClass() { result = "RegExpDollar" }
}
|
|
|
|
/**
 * A caret assertion `^` or `\A` matching the beginning of a line.
 *
 * Example:
 *
 * ```
 * ^
 * ```
 */
class RegExpCaret extends RegExpSpecialChar {
  RegExpCaret() { this.getChar() = ["^", "\\A"] }

  override string getPrimaryQLClass() { result = "RegExpCaret" }
}
|
|
|
|
/**
 * A zero-width match, that is, either an empty group or an assertion.
 *
 * Examples:
 * ```
 * ()
 * (?=\w)
 * ```
 */
class RegExpZeroWidthMatch extends RegExpGroup {
  RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) }

  // zero-width matches expose no children; for assertions, the inner term
  // is available through `RegExpSubPattern.getOperand`
  override RegExpTerm getChild(int i) { none() }

  override string getPrimaryQLClass() { result = "RegExpZeroWidthMatch" }
}
|
|
|
|
/**
 * A zero-width lookahead or lookbehind assertion.
 *
 * Examples:
 *
 * ```
 * (?=\w)
 * (?!\n)
 * (?<=\.)
 * (?<!\\)
 * ```
 */
class RegExpSubPattern extends RegExpZeroWidthMatch {
  RegExpSubPattern() { not re.emptyGroup(start, end) }

  /** Gets the term inside this lookahead or lookbehind assertion. */
  RegExpTerm getOperand() {
    exists(int in_start, int in_end | re.groupContents(start, end, in_start, in_end) |
      result.getRegex() = re and
      result.getStart() = in_start and
      result.getEnd() = in_end
    )
  }
}
|
|
|
|
/**
 * A zero-width lookahead assertion.
 *
 * Examples:
 *
 * ```
 * (?=\w)
 * (?!\n)
 * ```
 */
abstract class RegExpLookahead extends RegExpSubPattern { }
|
|
|
|
/**
 * A positive-lookahead assertion.
 *
 * Examples:
 *
 * ```
 * (?=\w)
 * ```
 */
class RegExpPositiveLookahead extends RegExpLookahead {
  RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) }

  override string getPrimaryQLClass() { result = "RegExpPositiveLookahead" }
}
|
|
|
|
/**
 * A negative-lookahead assertion.
 *
 * Examples:
 *
 * ```
 * (?!\n)
 * ```
 */
class RegExpNegativeLookahead extends RegExpLookahead {
  RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end) }

  override string getPrimaryQLClass() { result = "RegExpNegativeLookahead" }
}
|
|
|
|
/**
 * A zero-width lookbehind assertion.
 *
 * Examples:
 *
 * ```
 * (?<=\.)
 * (?<!\\)
 * ```
 */
abstract class RegExpLookbehind extends RegExpSubPattern { }
|
|
|
|
/**
 * A positive-lookbehind assertion.
 *
 * Examples:
 *
 * ```
 * (?<=\.)
 * ```
 */
class RegExpPositiveLookbehind extends RegExpLookbehind {
  RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end) }

  override string getPrimaryQLClass() { result = "RegExpPositiveLookbehind" }
}
|
|
|
|
/**
 * A negative-lookbehind assertion.
 *
 * Examples:
 *
 * ```
 * (?<!\\)
 * ```
 */
class RegExpNegativeLookbehind extends RegExpLookbehind {
  RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end) }

  override string getPrimaryQLClass() { result = "RegExpNegativeLookbehind" }
}
|
|
|
|
/**
 * A back reference, that is, a term of the form `\i` or `(?P=name)`
 * in a regular expression.
 *
 * Examples:
 *
 * ```
 * \1
 * (?P=quote)
 * ```
 */
class RegExpBackRef extends RegExpTerm, TRegExpBackRef {
  RegExpBackRef() { this = TRegExpBackRef(re, start, end) }

  /**
   * Gets the number of the capture group this back reference refers to, if any.
   */
  int getNumber() { result = re.getBackrefNumber(start, end) }

  /**
   * Gets the name of the capture group this back reference refers to, if any.
   */
  string getName() { result = re.getBackrefName(start, end) }

  /**
   * Gets the capture group this back reference refers to.
   *
   * The group is matched on the enclosing literal together with either
   * the group number or the group name.
   */
  RegExpGroup getGroup() {
    this.hasLiteralAndNumber(result.getLiteral(), result.getNumber()) or
    this.hasLiteralAndName(result.getLiteral(), result.getName())
  }

  /**
   * Join-order helper for `getGroup`.
   *
   * Holds if this back reference occurs in `literal` and refers to group number `number`.
   */
  pragma[nomagic]
  private predicate hasLiteralAndNumber(RegExpLiteral literal, int number) {
    literal = this.getLiteral() and
    number = this.getNumber()
  }

  /**
   * Join-order helper for `getGroup`.
   *
   * Holds if this back reference occurs in `literal` and refers to the group named `name`.
   */
  pragma[nomagic]
  private predicate hasLiteralAndName(RegExpLiteral literal, string name) {
    literal = this.getLiteral() and
    name = this.getName()
  }

  override RegExpTerm getChild(int i) { none() }

  override string getPrimaryQLClass() { result = "RegExpBackRef" }
}
|
|
|
|
/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */
RegExpTerm getParsedRegExp(StrConst re) { result.getRegex() = re and result.isRootTerm() }
|