Files
codeql/javascript/ql/lib/semmle/javascript/Regexp.qll
2025-11-13 09:46:14 +01:00

1447 lines
39 KiB
Plaintext

/**
* Provides classes for working with regular expressions.
*
* Regular expression literals are represented as an abstract syntax tree of regular expression
* terms.
*/
overlay[local]
module;
import javascript
private import semmle.javascript.dataflow.InferredTypes
private import semmle.javascript.internal.CachedStages
/**
 * A syntactic element that can contain a regular expression term.
 *
 * This covers regular expression literals, string literals (which are parsed as
 * regular expressions), and regular expression terms themselves.
 *
 * Examples:
 *
 * ```
 * // the literal and every term nested inside it are regexp parents
 * /((ECMA|Java)[sS]cript)*$/
 * ```
 */
class RegExpParent extends Locatable, @regexpparent { }
/**
 * A regular expression term: one syntactic component of a regular expression.
 *
 * Terms are found both inside regular expression literals such as `/[a-z]+/`
 * and inside string literals such as `"[a-z]+"`.
 *
 * Be aware that a string literal containing terms is not necessarily interpreted
 * as a regular expression at runtime. Use `isPartOfRegExpLiteral` or
 * `isUsedAsRegExp` to check whether a term really is used as a regular expression.
 *
 * Examples:
 *
 * ```
 * ((ECMA|Java)[sS]cript)*$
 * ((ECMA|Java)[sS]cript)*
 * (ECMA|Java)
 * $
 * ```
 */
class RegExpTerm extends Locatable, @regexpterm {
  /** Gets the child term of this term at index `i`. */
  RegExpTerm getChild(int i) { regexpterm(result, _, this, i, _) }

  /** Gets any one of the child terms of this term. */
  RegExpTerm getAChild() { exists(int idx | result = this.getChild(idx)) }

  /** Gets how many child terms this term has. */
  int getNumChild() { result = count(RegExpTerm child | child = this.getAChild()) }

  /** Gets the final child term, that is, the child at index `getNumChild() - 1`. */
  RegExpTerm getLastChild() {
    exists(int last | last = this.getNumChild() - 1 | result = this.getChild(last))
  }

  /**
   * Gets the parent of this term: the enclosing term, or, for a root term,
   * the literal that contains it.
   */
  RegExpParent getParent() { regexpterm(this, _, result, _, _) }

  /** Gets the regular expression literal containing this term, if there is one. */
  RegExpLiteral getLiteral() {
    exists(RegExpTerm root | root = this.getRootTerm() | result = root.getParent())
  }

  override string toString() { regexpterm(this, _, _, _, result) }

  /** Gets the source text of this term exactly as written. */
  string getRawValue() { regexpterm(this, _, _, _, result) }

  /** Holds if the empty string is matched by this term. */
  predicate isNullable() { none() } // Subclasses override this as appropriate.

  /** Gets the term that is (textually) matched immediately before this one, if any. */
  RegExpTerm getPredecessor() {
    exists(RegExpTerm enclosing | enclosing = this.getParent() |
      // No earlier sibling in a sequence: continue with the enclosing term,
      // but never step out of a lookahead or lookbehind.
      not enclosing instanceof RegExpSubPattern and
      not exists(enclosing.(RegExpSequence).previousElement(this)) and
      result = enclosing.getPredecessor()
      or
      // The previous sibling within the enclosing sequence.
      result = enclosing.(RegExpSequence).previousElement(this)
    )
  }

  /** Gets the term that is (textually) matched immediately after this one, if any. */
  RegExpTerm getSuccessor() {
    exists(RegExpTerm enclosing | enclosing = this.getParent() |
      // No later sibling in a sequence: continue with the enclosing term,
      // but never step out of a lookahead or lookbehind.
      not enclosing instanceof RegExpSubPattern and
      not exists(enclosing.(RegExpSequence).nextElement(this)) and
      result = enclosing.getSuccessor()
      or
      // The next sibling within the enclosing sequence.
      result = enclosing.(RegExpSequence).nextElement(this)
    )
  }

  /**
   * Holds if this term is matched in forward direction, that is,
   * no lookbehind assertion encloses it.
   */
  predicate isInForwardMatchingContext() { not this.isInBackwardMatchingContext() }

  /**
   * Holds if this term is matched in backward direction, that is,
   * it is enclosed by a lookbehind assertion.
   */
  predicate isInBackwardMatchingContext() {
    exists(RegExpLookbehind lookbehind | this = lookbehind.getAChild+())
  }

  /**
   * Holds if this term is the root term of a regular expression,
   * that is, its parent is not itself a term.
   */
  predicate isRootTerm() { not exists(RegExpTerm outer | outer = this.getParent()) }

  /**
   * Gets the outermost term of the regular expression this term belongs to.
   */
  RegExpTerm getRootTerm() {
    result = this.getParent().(RegExpTerm).getRootTerm()
    or
    this.isRootTerm() and
    result = this
  }

  /**
   * Holds if this term belongs to a regular expression literal.
   */
  predicate isPartOfRegExpLiteral() { exists(this.getLiteral()) }

  /**
   * Holds if this term belongs to a string literal.
   *
   * Whether that string literal is actually used as a regular expression
   * is not taken into account; see `isUsedAsRegExp` for that.
   */
  predicate isPartOfStringLiteral() { this.getRootTerm().getParent() instanceof StringLiteral }

  /**
   * Holds if this term belongs to a regular expression literal, or to a string literal
   * that is interpreted as a regular expression.
   *
   * In contrast to `isPartOfRegExpLiteral` and `isPartOfStringLiteral`, data flow is
   * consulted here, so string literals that are never used as regular expressions
   * are excluded.
   *
   * For example:
   * ```js
   * location.href.match("^https://example\\.com/") // YES - the string is used as a regexp
   *
   * console.log("Hello world"); // NO - the string is not used as a regexp
   *
   * /[a-z]+/g; // YES - regexp literals are always used as regexps
   * ```
   */
  overlay[global]
  predicate isUsedAsRegExp() {
    exists(RegExpParent container | container = this.getRootTerm().getParent() |
      container.(Expr).flow() instanceof RegExpPatternSource
      or
      container instanceof RegExpLiteral
    )
  }

  /**
   * Gets the one string matched by this term.
   *
   * Only (sequences/groups of) constant terms have a constant value. Notably, terms
   * containing zero-width assertions such as `^` or `\b` are not treated as constant.
   *
   * The flags of the enclosing regular expression literal are not taken into account.
   */
  string getConstantValue() { none() }

  /**
   * Gets one of the strings matched by this term.
   */
  string getAMatchedString() { result = this.getConstantValue() }

  /** Holds if the location of this term is described by the given coordinates. */
  predicate hasLocationInfo(
    string filepath, int startline, int startcolumn, int endline, int endcolumn
  ) {
    this.getLocation().hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
  }
}
/**
 * A regular expression term carrying a quantifier.
 *
 * Example:
 *
 * ```
 * ((ECMA|Java)[sS]cript)*
 * ```
 */
class RegExpQuantifier extends RegExpTerm, @regexp_quantifier {
  /** Holds if this term's quantifier is greedy. */
  predicate isGreedy() {
    is_greedy(this)
  }
}
/**
 * A quantified term without an upper limit on the number of repetitions:
 * a star, a plus, or a range quantifier that has no upper bound.
 */
class InfiniteRepetitionQuantifier extends RegExpQuantifier {
  InfiniteRepetitionQuantifier() {
    this instanceof RegExpStar
    or
    this instanceof RegExpPlus
    or
    exists(RegExpRange range | range = this | not exists(range.getUpperBound()))
  }
}
/**
 * A regular expression term that begins with a backslash,
 * that is, an escaped term.
 *
 * Example:
 *
 * ```
 * \.
 * \w
 * ```
 */
class RegExpEscape extends RegExpTerm, @regexp_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpEscape"
  }
}
/**
 * A regular expression term that matches exactly one string,
 * that is, a constant term.
 *
 * Example:
 *
 * ```
 * abc
 * ```
 */
class RegExpConstant extends RegExpTerm, @regexp_constant {
  /** Gets the string that this constant term matches. */
  string getValue() { regexp_const_value(this, result) }

  /**
   * Holds if this constant stands for a valid Unicode character, as opposed to a
   * surrogate code point that does not correspond to a character on its own.
   */
  predicate isCharacter() { any() }

  override string getAPrimaryQlClass() { result = "RegExpConstant" }

  override string getConstantValue() { result = this.getValue() }

  override predicate isNullable() { none() }
}
/**
 * An escape denoting a single character in a regular expression.
 *
 * Example:
 *
 * ```
 * \.
 * ```
 */
class RegExpCharEscape extends RegExpEscape, RegExpConstant, @regexp_char_escape {
  override predicate isCharacter() {
    not exists(string lowered |
      lowered = this.toString().toLowerCase() and
      // the database represents unencodable characters as '?' or \uFFFD
      this.getValue() = ["?", 65533.toUnicode()] and
      // unencodable characters can only come from Unicode escapes
      lowered.matches("\\\\u%") and
      // whereas '\u003f' really is the '?' character
      lowered != "\\u003f"
    )
  }

  override string getAPrimaryQlClass() { result = "RegExpCharEscape" }
}
/**
 * An alternation, that is, a term written as `a|b`.
 *
 * Example:
 *
 * ```
 * ECMA|Java
 * ```
 */
class RegExpAlt extends RegExpTerm, @regexp_alt {
  /** Gets one of the alternatives making up this term. */
  RegExpTerm getAlternative() { result = this.getAChild() }

  /** Gets how many alternatives this term has. */
  int getNumAlternative() { result = this.getNumChild() }

  override string getAPrimaryQlClass() { result = "RegExpAlt" }

  // Nullable as soon as one alternative is nullable.
  override predicate isNullable() { this.getAlternative().isNullable() }

  override string getAMatchedString() { result = this.getAlternative().getAMatchedString() }
}
/**
 * An intersection of character classes, that is, a term written as `[[a]&&[ab]]`.
 *
 * Example:
 *
 * ```
 * /[[abc]&&[bcd]]/v - which matches 'b' and 'c' only.
 * ```
 */
class RegExpIntersection extends RegExpTerm, @regexp_intersection {
  /** Gets one of the terms being intersected. */
  RegExpTerm getAnElement() { result = this.getAChild() }

  /** Gets how many terms are being intersected. */
  int getNumIntersectedTerm() { result = this.getNumChild() }

  override string getAPrimaryQlClass() { result = "RegExpIntersection" }

  override predicate isNullable() { this.getAnElement().isNullable() }
}
/**
 * A subtraction of character classes, that is, a term written as `[[a]--[ab]]`.
 *
 * Example:
 *
 * ```
 * /[[abc]--[bc]]/v - which matches 'a' only.
 * ```
 */
class RegExpSubtraction extends RegExpTerm, @regexp_subtraction {
  /** Gets the minuend, that is, the left-most operand (child 0) of this subtraction. */
  RegExpTerm getFirstTerm() { result = this.getChild(0) }

  /** Gets the number of subtracted terms, that is, the number of children minus one. */
  int getNumSubtractedTerm() { result = this.getNumChild() - 1 }

  /** Gets a subtrahend, that is, any operand after the first one. */
  RegExpTerm getASubtractedTerm() { exists(int idx | result = this.getChild(idx) | idx >= 1) }

  override string getAPrimaryQlClass() { result = "RegExpSubtraction" }

  override predicate isNullable() { none() }
}
/**
 * A sequence of terms matched one after another.
 *
 * Example:
 *
 * ```
 * (ECMA|Java)Script
 * ```
 *
 * The elements of this sequence are `(ECMA|Java)` and `Script`.
 */
class RegExpSequence extends RegExpTerm, @regexp_seq {
  /** Gets one of the elements of this sequence. */
  RegExpTerm getElement() { result = this.getAChild() }

  /** Gets how many elements this sequence has. */
  int getNumElement() { result = this.getNumChild() }

  // A sequence is nullable exactly when none of its elements is non-nullable.
  override predicate isNullable() {
    not exists(RegExpTerm element |
      element = this.getAChild() and
      not element.isNullable()
    )
  }

  override string getConstantValue() { result = this.getConstantValue(0) }

  /**
   * Gets the single string matched by the children of this sequence from index `i`
   * onwards, if any. For `i` equal to the number of children this is the empty string.
   */
  private string getConstantValue(int i) {
    result = this.getChild(i).getConstantValue() + this.getConstantValue(i + 1)
    or
    i = this.getNumChild() and
    result = ""
  }

  /** Gets the element of this sequence that comes directly before `element`. */
  RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) }

  /** Gets the element of this sequence that comes directly after `element`. */
  RegExpTerm nextElement(RegExpTerm element) {
    exists(int pos | element = this.getChild(pos) | result = this.getChild(pos + 1))
  }

  override string getAPrimaryQlClass() { result = "RegExpSequence" }
}
/**
 * An anchor assertion: either a caret `^` (beginning of a line) or
 * a dollar `$` (end of a line).
 *
 * Example:
 *
 * ```
 * ^
 * $
 * ```
 */
class RegExpAnchor extends RegExpTerm, @regexp_anchor {
  /** Gets the character that denotes this anchor. */
  abstract string getChar();

  override string getAPrimaryQlClass() { result = "RegExpAnchor" }

  override predicate isNullable() { any() }
}
/**
 * The caret assertion `^`, which matches at the beginning of a line.
 *
 * Example:
 *
 * ```
 * ^
 * ```
 */
class RegExpCaret extends RegExpAnchor, @regexp_caret {
  override string getChar() { result = "^" }

  override string getAPrimaryQlClass() { result = "RegExpCaret" }
}
/**
 * The dollar assertion `$`, which matches at the end of a line.
 *
 * Example:
 *
 * ```
 * $
 * ```
 */
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
  override string getChar() { result = "$" }

  override string getAPrimaryQlClass() { result = "RegExpDollar" }
}
/**
 * An assertion that matches at a word boundary.
 *
 * Example:
 *
 * ```
 * \b
 * ```
 */
class RegExpWordBoundary extends RegExpTerm, @regexp_wordboundary {
  override string getAPrimaryQlClass() { result = "RegExpWordBoundary" }

  override predicate isNullable() { any() }
}
/**
 * An assertion that matches anywhere except at a word boundary.
 *
 * Example:
 *
 * ```
 * \B
 * ```
 */
class RegExpNonWordBoundary extends RegExpTerm, @regexp_nonwordboundary {
  override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" }

  override predicate isNullable() { any() }
}
/**
 * A zero-width assertion that looks ahead or looks behind.
 *
 * Examples:
 *
 * ```
 * (?=\w)
 * (?!\n)
 * (?<=\.)
 * (?<!\\)
 * ```
 */
class RegExpSubPattern extends RegExpTerm, @regexp_subpattern {
  /** Gets the term inside this lookahead or lookbehind assertion. */
  RegExpTerm getOperand() { result = this.getAChild() }

  // The assertion itself is zero-width, so it is always nullable.
  override predicate isNullable() { any() }
}
/**
 * A zero-width assertion that looks ahead.
 *
 * Examples:
 *
 * ```
 * (?=\w)
 * (?!\n)
 * ```
 */
class RegExpLookahead extends RegExpSubPattern, @regexp_lookahead {
  override string getAPrimaryQlClass() {
    result = "RegExpLookahead"
  }
}
/**
 * A zero-width assertion that looks behind.
 *
 * Examples:
 *
 * ```
 * (?<=\.)
 * (?<!\\)
 * ```
 */
class RegExpLookbehind extends RegExpSubPattern, @regexp_lookbehind {
  override string getAPrimaryQlClass() {
    result = "RegExpLookbehind"
  }
}
/**
 * A lookahead assertion in its positive form.
 *
 * Examples:
 *
 * ```
 * (?=\w)
 * ```
 */
class RegExpPositiveLookahead extends RegExpLookahead, @regexp_positive_lookahead {
  override string getAPrimaryQlClass() {
    result = "RegExpPositiveLookahead"
  }
}
/**
 * A lookahead assertion in its negative form.
 *
 * Examples:
 *
 * ```
 * (?!\n)
 * ```
 */
class RegExpNegativeLookahead extends RegExpLookahead, @regexp_negative_lookahead {
  override string getAPrimaryQlClass() {
    result = "RegExpNegativeLookahead"
  }
}
/**
 * A lookbehind assertion in its positive form.
 *
 * Examples:
 *
 * ```
 * (?<=\.)
 * ```
 */
class RegExpPositiveLookbehind extends RegExpLookbehind, @regexp_positive_lookbehind {
  override string getAPrimaryQlClass() {
    result = "RegExpPositiveLookbehind"
  }
}
/**
 * A lookbehind assertion in its negative form.
 *
 * Examples:
 *
 * ```
 * (?<!\\)
 * ```
 */
class RegExpNegativeLookbehind extends RegExpLookbehind, @regexp_negative_lookbehind {
  override string getAPrimaryQlClass() {
    result = "RegExpNegativeLookbehind"
  }
}
/**
 * A term quantified with a star.
 *
 * Example:
 *
 * ```
 * \w*
 * ```
 */
class RegExpStar extends RegExpQuantifier, @regexp_star {
  override string getAPrimaryQlClass() { result = "RegExpStar" }

  override predicate isNullable() { any() }
}
/**
 * A term quantified with a plus.
 *
 * Example:
 *
 * ```
 * \w+
 * ```
 */
class RegExpPlus extends RegExpQuantifier, @regexp_plus {
  override string getAPrimaryQlClass() { result = "RegExpPlus" }

  // Nullable only if the repeated term is itself nullable.
  override predicate isNullable() { this.getAChild().isNullable() }
}
/**
 * A term made optional with `?`.
 *
 * Example:
 *
 * ```
 * ;?
 * ```
 */
class RegExpOpt extends RegExpQuantifier, @regexp_opt {
  override string getAPrimaryQlClass() { result = "RegExpOpt" }

  override predicate isNullable() { any() }
}
/**
 * A term quantified with a range.
 *
 * Examples:
 *
 * ```
 * \w{2,4}
 * \w{2,}
 * \w{2}
 * ```
 */
class RegExpRange extends RegExpQuantifier, @regexp_range {
  /** Gets the lower bound of this range. */
  int getLowerBound() { range_quantifier_lower_bound(this, result) }

  /**
   * Gets the upper bound of this range, if it has one.
   *
   * A range without an upper bound permits any number of repetitions.
   * For a term of the form `r{lo}`, the lower and the upper bound are
   * both `lo`.
   */
  int getUpperBound() { range_quantifier_upper_bound(this, result) }

  // Nullable if zero repetitions are allowed or the repeated term is nullable.
  override predicate isNullable() {
    this.getLowerBound() = 0
    or
    this.getAChild().isNullable()
  }

  override string getAPrimaryQlClass() { result = "RegExpRange" }
}
/**
 * The dot term of a regular expression.
 *
 * Example:
 *
 * ```
 * .
 * ```
 */
class RegExpDot extends RegExpTerm, @regexp_dot {
  override string getAPrimaryQlClass() { result = "RegExpDot" }

  override predicate isNullable() { none() }
}
/**
 * A parenthesized group in a regular expression.
 *
 * Examples:
 *
 * ```
 * (ECMA|Java)
 * (?:ECMA|Java)
 * (?<quote>['"])
 * ```
 */
class RegExpGroup extends RegExpTerm, @regexp_group {
  /** Holds if this group is a capture group. */
  predicate isCapture() { is_capture(this, _) }

  /**
   * Gets the index of this capture group within the regular expression
   * literal that encloses it.
   *
   * For instance, in `/((a?).)(?:b)/` the group `((a?).)` has index 1 and
   * the nested group `(a?)` has index 2. The group `(?:b)` is not a capture
   * group and therefore has no index.
   */
  int getNumber() { is_capture(this, result) }

  /** Holds if this group is a named capture group. */
  predicate isNamed() { is_named_capture(this, _) }

  /** Gets the name of this capture group, if it has one. */
  string getName() { is_named_capture(this, result) }

  override string getAPrimaryQlClass() { result = "RegExpGroup" }

  // The remaining predicates simply delegate to the grouped term.
  override predicate isNullable() { this.getAChild().isNullable() }

  override string getConstantValue() { result = this.getAChild().getConstantValue() }

  override string getAMatchedString() { result = this.getAChild().getAMatchedString() }
}
/**
 * A run of ordinary characters that carry no special meaning in a regular expression.
 *
 * Example:
 *
 * ```
 * abc
 * ;
 * ```
 */
class RegExpNormalConstant extends RegExpConstant, @regexp_normal_constant {
  override string getAPrimaryQlClass() {
    result = "RegExpNormalConstant"
  }
}
/**
 * A character escape written in hexadecimal notation.
 *
 * Example:
 *
 * ```
 * \x0a
 * ```
 */
class RegExpHexEscape extends RegExpCharEscape, @regexp_hex_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpHexEscape"
  }
}
/**
 * A character escape written as a Unicode escape.
 *
 * Example:
 *
 * ```
 * \u000a
 * ```
 */
class RegExpUnicodeEscape extends RegExpCharEscape, @regexp_unicode_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpUnicodeEscape"
  }
}
/**
 * A character escape written in decimal notation.
 *
 * Example:
 *
 * ```
 * \0
 * ```
 */
class RegExpDecimalEscape extends RegExpCharEscape, @regexp_dec_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpDecimalEscape"
  }
}
/**
 * A character escape written in octal notation.
 *
 * Example:
 *
 * ```
 * \0177
 * ```
 */
class RegExpOctalEscape extends RegExpCharEscape, @regexp_oct_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpOctalEscape"
  }
}
/**
 * An escape denoting a control character.
 *
 * Example:
 *
 * ```
 * \ca
 * ```
 */
class RegExpControlEscape extends RegExpCharEscape, @regexp_ctrl_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpControlEscape"
  }
}
/**
 * An escape denoting a character class.
 *
 * Examples:
 *
 * ```
 * \w
 * \S
 * ```
 */
class RegExpCharacterClassEscape extends RegExpEscape, @regexp_char_class_escape {
  /** Gets the name of the character class, such as `w` in the case of `\w`. */
  string getValue() { char_class_escape(this, result) }

  override string getAPrimaryQlClass() { result = "RegExpCharacterClassEscape" }

  override predicate isNullable() { none() }
}
/**
 * An escape denoting a Unicode property.
 *
 * Examples:
 *
 * ```
 * \p{Number}
 * \p{Script=Greek}
 * ```
 */
class RegExpUnicodePropertyEscape extends RegExpEscape, @regexp_unicode_property_escape {
  /**
   * Gets the name of the Unicode property, such as `Number` in the case of
   * `\p{Number}` or `Script` in the case of `\p{Script=Greek}`.
   */
  string getName() { unicode_property_escapename(this, result) }

  /**
   * Gets the value of the Unicode property, if it has one.
   *
   * For instance, `\p{Script=Greek}` has the value `Greek`, whereas
   * `\p{Number}` has no value.
   */
  string getValue() { unicode_property_escapevalue(this, result) }

  override string getAPrimaryQlClass() { result = "RegExpUnicodePropertyEscape" }

  override predicate isNullable() { none() }
}
/**
 * An identity escape: an escaped character that simply stands for itself.
 *
 * Examples:
 *
 * ```
 * \\
 * \/
 * ```
 */
class RegExpIdentityEscape extends RegExpCharEscape, @regexp_id_escape {
  override string getAPrimaryQlClass() {
    result = "RegExpIdentityEscape"
  }
}
/**
 * A back reference, that is, a term of the form `\i` or `\k<name>`
 * in a regular expression.
 *
 * Examples:
 *
 * ```
 * \1
 * \k<quote>
 * ```
 */
class RegExpBackRef extends RegExpTerm, @regexp_backref {
  /**
   * Gets the number of the capture group this back reference refers to, if any.
   */
  int getNumber() { backref(this, result) }

  /**
   * Gets the name of the capture group this back reference refers to, if any.
   */
  string getName() { named_backref(this, result) }

  /**
   * Gets the capture group this back reference refers to.
   *
   * The group is searched among the terms that share this back reference's root
   * term. Comparing root terms rather than `getLiteral()` makes the lookup work
   * for regular expressions parsed from string literals as well, since
   * `getLiteral()` only has a result for terms of a regular expression literal.
   */
  RegExpGroup getGroup() {
    result.getRootTerm() = this.getRootTerm() and
    (
      result.getNumber() = this.getNumber()
      or
      result.getName() = this.getName()
    )
  }

  // A back reference can match the empty string if the referenced group can.
  override predicate isNullable() { this.getGroup().isNullable() }

  override string getAPrimaryQlClass() { result = "RegExpBackRef" }
}
/**
 * A bracketed character class in a regular expression.
 *
 * Examples:
 *
 * ```
 * [a-z_]
 * [^<>&]
 * ```
 */
class RegExpCharacterClass extends RegExpTerm, @regexp_char_class {
  /** Holds if this character class is inverted, that is, written as `[^...]`. */
  predicate isInverted() { is_inverted(this) }

  override predicate isNullable() { none() }

  // Matched strings are only known for non-inverted classes.
  override string getAMatchedString() {
    result = this.getAChild().getAMatchedString() and
    not this.isInverted()
  }

  /**
   * Holds if every character is matched by this character class.
   */
  predicate isUniversalClass() {
    // [\w\W] and similar: two class escapes differing only in letter case
    not this.isInverted() and
    exists(string escapeName, string counterpart |
      escapeName = this.getAChild().(RegExpCharacterClassEscape).getValue() and
      counterpart = this.getAChild().(RegExpCharacterClassEscape).getValue() and
      escapeName != counterpart and
      escapeName.toLowerCase() = counterpart.toLowerCase()
    )
    or
    // [^]
    this.isInverted() and
    not exists(this.getAChild())
  }

  override string getAPrimaryQlClass() { result = "RegExpCharacterClass" }
}
/**
 * A range of characters inside a character class.
 *
 * Example:
 *
 * ```
 * a-z
 * ```
 */
class RegExpCharacterRange extends RegExpTerm, @regexp_char_range {
  /** Holds if this range runs from the lower bound `lo` to the upper bound `hi`. */
  predicate isRange(string lo, string hi) {
    exists(RegExpConstant lower, RegExpConstant upper |
      lower = this.getChild(0) and
      upper = this.getChild(1)
    |
      lo = lower.getValue() and
      hi = upper.getValue()
    )
  }

  override string getAPrimaryQlClass() { result = "RegExpCharacterRange" }

  override predicate isNullable() { none() }
}
/** An error that was encountered while parsing a regular expression literal. */
class RegExpParseError extends Error, @regexp_parse_error {
  /** Gets the term at which the parse error was triggered. */
  RegExpTerm getTerm() { regexp_parse_errors(this, result, _) }

  /** Gets the regular expression literal containing the term that triggered the error. */
  RegExpLiteral getLiteral() {
    exists(RegExpTerm culprit | culprit = this.getTerm() | result = culprit.getLiteral())
  }

  override string getMessage() { regexp_parse_errors(this, _, result) }

  // The string representation is the error message itself.
  override string toString() { result = this.getMessage() }

  override predicate isFatal() { none() }
}
/**
 * Holds if `func` is the initializer of an external instance member declaration
 * with qualified name `String.<name>`, that is, a method on `String.prototype`.
 */
overlay[global]
private predicate isNativeStringMethod(Function func, string name) {
  func = any(ExternalInstanceMemberDecl decl | decl.hasQualifiedName("String", name)).getInit()
}
/**
 * Holds if `name` names a property of the Match object that `String.prototype.match`
 * returns. Array indices are not included.
 */
overlay[global]
private predicate isMatchObjectProperty(string name) {
  // properties specific to match results
  name = ["length", "index", "input", "groups"]
  or
  // anything declared as an instance member of `Array`
  exists(ExternalInstanceMemberDecl decl | decl.hasQualifiedName("Array", name))
}
/**
 * Holds if `call` is a one-argument call to `match` or `matchAll` whose result
 * is used in a way that is incompatible with Match objects.
 */
overlay[global]
private predicate isUsedAsNonMatchObject(DataFlow::MethodCallNode call) {
  call.getNumArgument() = 1 and
  call.getMethodName() = ["match", "matchAll"] and
  (
    // The call is `sinon.match(...)`
    call = API::moduleImport("sinon").getMember("match").getACall()
    or
    // The result is obviously unused
    exists(ExprStmt stmt | call.asExpr() = stmt.getExpr())
    or
    // The result is awaited
    exists(AwaitExpr await | call.flowsToExpr(await.getOperand()))
    or
    // A property is read that Match objects do not have
    exists(string propName |
      not exists(propName.toInt()) and
      not isMatchObjectProperty(propName) and
      exists(call.getAPropertyRead(propName))
    )
  )
}
/**
 * Holds if the way `value` is used suggests that it is a number: it is compared
 * against a possibly-numeric expression, used in arithmetic or unary negation,
 * used as an index, or passed to a call of one of a fixed set of
 * position-taking method names.
 */
overlay[global]
pragma[inline]
private predicate isUsedAsNumber(DataFlow::LocalSourceNode value) {
  // compared with something that may be a number
  exists(Comparison cmp, Expr other | canBeNumber(other.analyze()) |
    cmp.hasOperands(value.getALocalUse().asExpr(), other)
  )
  or
  // operand of an arithmetic expression
  exists(ArithmeticExpr arith | value.flowsToExpr(arith.getAnOperand()))
  or
  // operand of unary minus
  exists(UnaryExpr negation | negation.getOperator() = "-" |
    value.flowsToExpr(negation.getOperand())
  )
  or
  // property name of an index expression
  exists(IndexExpr indexing | value.flowsToExpr(indexing.getPropertyNameExpr()))
  or
  // argument of a call to one of the listed methods
  exists(DataFlow::CallNode call |
    value.flowsTo(call.getAnArgument()) and
    call.getCalleeName() =
      ["substring", "substr", "slice", "splice", "charAt", "charCodeAt", "codePointAt", "toSpliced"]
  )
}
/** Holds if one of the types inferred for `node` is the string type. */
bindingset[node]
overlay[global]
pragma[inline_late]
private predicate canBeString(DataFlow::AnalyzedNode node) {
  TTString() = node.getAType()
}
/** Holds if one of the types inferred for `node` is the number type. */
bindingset[node]
overlay[global]
pragma[inline_late]
private predicate canBeNumber(DataFlow::AnalyzedNode node) {
  TTNumber() = node.getAType()
}
/**
 * Holds if `source` is a possibly-string value in a position where it may be
 * interpreted as a regular expression.
 */
overlay[global]
cached
predicate isInterpretedAsRegExp(DataFlow::Node source) {
  Stages::Taint::ref() and
  canBeString(source) and
  (
    // First argument of an invocation of `RegExp`, with or without `new`.
    source = DataFlow::globalVarRef("RegExp").getAnInvocation().getArgument(0)
    or
    // Sole argument of a string method that coerces its argument to a regular expression.
    exists(DataFlow::MethodCallNode call, string methodName |
      call.getMethodName() = methodName and
      canBeString(call.getReceiver()) and
      // every known callee must be the native string method of that name
      forall(Function callee | callee = call.getACallee() |
        isNativeStringMethod(callee, methodName)
      )
    |
      source = call.getArgument(0) and
      call.getNumArgument() = 1 and
      (
        methodName = ["match", "matchAll"] and
        not isUsedAsNonMatchObject(call)
        or
        // "search" is a common method name, and the built-in "search" method is rarely used,
        // so to reduce FPs we also require that the return value appears to be used as a number.
        methodName = "search" and
        isUsedAsNumber(call)
      )
    )
    or
    // `pattern` values and `patternProperties` keys found in a part of a JSON schema.
    exists(DataFlow::SourceNode schema | schema = JsonSchema::getAPartOfJsonSchema() |
      source =
        schema
            .getAPropertySource("patternProperties")
            .getAPropertyWrite()
            .getPropertyNameExpr()
            .flow()
      or
      source = schema.getAPropertyWrite("pattern").getRhs()
    )
  )
}
/**
 * Gets a node whose value may flow (inter-procedurally) to `re`, where it is
 * interpreted as part of a regular expression. The back-tracker `t` records
 * the state of the backwards type tracking.
 */
overlay[global]
private DataFlow::Node regExpSource(DataFlow::Node re, DataFlow::TypeBackTracker t) {
  // Base case: `re` itself is interpreted as a regular expression.
  isInterpretedAsRegExp(result) and
  re = result and
  t.start()
  or
  // Step case: go one step backwards from a node already known to reach `re`.
  exists(DataFlow::TypeBackTracker later, DataFlow::Node next | next = regExpSource(re, later) |
    // a shared taint step, which leaves the tracker unchanged
    t = later and
    TaintTracking::sharedTaintStep(result, next)
    or
    // a type-tracking small step
    later = t.smallstep(result, next)
  )
}
/**
 * Gets a node whose value may flow (inter-procedurally) to `re`, where it is
 * interpreted as part of a regular expression.
 *
 * This is the back-tracking overload evaluated at the end state of the tracker.
 */
overlay[global]
private DataFlow::Node regExpSource(DataFlow::Node re) {
  exists(DataFlow::TypeBackTracker done | done = DataFlow::TypeBackTracker::end() |
    result = regExpSource(re, done)
  )
}
/**
 * A data-flow node whose value may reach a position where it is interpreted
 * as part of a regular expression.
 */
overlay[global]
abstract class RegExpPatternSource extends DataFlow::Node {
  /**
   * Gets a node at which the pattern of this node is parsed as part of
   * a regular expression.
   */
  abstract DataFlow::Node getAParse();

  /**
   * Gets the pattern of this node, that is, the text interpreted as part
   * of a regular expression.
   */
  abstract string getPattern();

  /**
   * Gets a regular expression object constructed from the pattern of
   * this node.
   */
  abstract DataFlow::SourceNode getARegExpObject();

  /**
   * Gets the root term of the regular expression that this pattern is parsed into.
   */
  abstract RegExpTerm getRegExpTerm();
}
/**
 * A regular expression literal, seen as its own pattern source: the literal is
 * the place where it is parsed as well as the resulting regular expression object.
 */
overlay[global]
private class RegExpLiteralPatternSource extends RegExpPatternSource, DataFlow::ValueNode {
  override RegExpLiteral astNode;

  override RegExpTerm getRegExpTerm() { result = astNode.getRoot() }

  override string getPattern() {
    // `/` is escaped inside the literal; undo that in the reported pattern
    exists(string raw | raw = astNode.getRoot().getRawValue() |
      result = raw.regexpReplaceAll("\\\\/", "/")
    )
  }

  override DataFlow::Node getAParse() { result = this }

  override DataFlow::SourceNode getARegExpObject() { result = this }
}
/**
 * A node whose string value may reach a position (`parse`) where it is interpreted
 * as part of a regular expression. Its regular expression term is the one obtained
 * by viewing the node's expression as a string literal.
 */
overlay[global]
private class StringRegExpPatternSource extends RegExpPatternSource {
  DataFlow::Node parse;

  StringRegExpPatternSource() { this = regExpSource(parse) }

  override DataFlow::Node getAParse() { result = parse }

  override string getPattern() { result = this.getStringValue() }

  override RegExpTerm getRegExpTerm() { result = this.asExpr().(StringLiteral).asRegExp() }

  // Only `RegExp(...)` invocations taking `parse` as first argument yield an object.
  override DataFlow::SourceNode getARegExpObject() {
    exists(DataFlow::InvokeNode invocation |
      result = invocation and
      parse = invocation.getArgument(0) and
      invocation = DataFlow::globalVarRef("RegExp").getAnInvocation()
    )
  }
}
/**
 * A node whose string value may reach a position (`parse`) where it is interpreted
 * as part of a regular expression. Its regular expression term is the one obtained
 * by viewing the node's expression as an `AddExpr`, that is, a string concatenation.
 */
overlay[global]
private class StringConcatRegExpPatternSource extends RegExpPatternSource {
  DataFlow::Node parse;

  StringConcatRegExpPatternSource() { this = regExpSource(parse) }

  override DataFlow::Node getAParse() { result = parse }

  override string getPattern() { result = this.getStringValue() }

  override RegExpTerm getRegExpTerm() { result = this.asExpr().(AddExpr).asRegExp() }

  // Only `RegExp(...)` invocations taking `parse` as first argument yield an object.
  override DataFlow::SourceNode getARegExpObject() {
    exists(DataFlow::InvokeNode invocation |
      result = invocation and
      parse = invocation.getArgument(0) and
      invocation = DataFlow::globalVarRef("RegExp").getAnInvocation()
    )
  }
}
/**
 * A quoted string escape, written with the `\q` syntax.
 * Alternation with `|` is the only operation supported inside a quoted string.
 *
 * Example:
 *
 * ```
 * \q{foo}
 * \q{a|b|c}
 * ```
 */
class RegExpQuotedString extends RegExpTerm, @regexp_quoted_string {
  /** Gets the term that represents what this quoted string contains. */
  RegExpTerm getTerm() { result = this.getAChild() }

  override string getAPrimaryQlClass() { result = "RegExpQuotedString" }

  override predicate isNullable() { none() }

  override string getAMatchedString() {
    exists(RegExpTerm contents | contents = this.getTerm() |
      result = contents.getAMatchedString()
    )
  }
}
module RegExp {
/** Gets `"?"`, the string standing in for the flags of a regular expression when they are unknown. */
string unknownFlag() {
  result = "?"
}
/** Holds if the `m` flag occurs in `flags`. */
bindingset[flags]
predicate isMultiline(string flags) {
  flags.matches("%m%")
}
/** Holds if the `g` flag occurs in `flags`. */
bindingset[flags]
predicate isGlobal(string flags) {
  flags.matches("%g%")
}
/** Holds if the `i` flag occurs in `flags`. */
bindingset[flags]
predicate isIgnoreCase(string flags) {
  flags.matches("%i%")
}
/** Holds if the `s` flag occurs in `flags`. */
bindingset[flags]
predicate isDotAll(string flags) {
  flags.matches("%s%")
}
/** Holds if the `v` flag occurs in `flags`. */
bindingset[flags]
predicate isUnicodeSets(string flags) {
  flags.matches("%v%")
}
/** Holds if `flags` is the unknown flag `?` or includes the `m` flag. */
bindingset[flags]
predicate maybeMultiline(string flags) {
  isMultiline(flags)
  or
  flags = unknownFlag()
}
/** Holds if `flags` is the unknown flag `?` or includes the `g` flag. */
bindingset[flags]
predicate maybeGlobal(string flags) {
  isGlobal(flags)
  or
  flags = unknownFlag()
}
/** Holds if `flags` is the unknown flag `?` or includes the `i` flag. */
bindingset[flags]
predicate maybeIgnoreCase(string flags) {
  isIgnoreCase(flags)
  or
  flags = unknownFlag()
}
/** Holds if `flags` is the unknown flag `?` or includes the `s` flag. */
bindingset[flags]
predicate maybeDotAll(string flags) {
  isDotAll(flags)
  or
  flags = unknownFlag()
}
/** Holds if `term`, including each of its disjuncts, is anchored at both ends. */
predicate isFullyAnchoredTerm(RegExpTerm term) {
  // an alternation all of whose disjuncts are fully anchored
  isFullyAnchoredAlt(term, term.getNumChild())
  or
  // a group whose contents are fully anchored
  isFullyAnchoredTerm(term.(RegExpGroup).getAChild())
  or
  // a sequence starting with `^` and ending with `$`
  term.(RegExpSequence).getChild(0) instanceof RegExpCaret and
  term.(RegExpSequence).getLastChild() instanceof RegExpDollar
}
/** Holds if each of the first `i` disjuncts of `term` is fully anchored. */
private predicate isFullyAnchoredAlt(RegExpAlt term, int i) {
  // base case: just the first disjunct
  i = 1 and
  isFullyAnchoredTerm(term.getChild(0))
  or
  // inductive case: the first `i - 1` disjuncts, plus the disjunct at index `i - 1`
  isFullyAnchoredTerm(term.getChild(i - 1)) and
  isFullyAnchoredAlt(term, i - 1)
}
/**
 * Holds if `term` matches every character apart from an explicitly listed set of exceptions.
 *
 * For example, this holds for `.`, `[^<>]`, and `\W`, but does not hold for
 * `[a-z]`, `\w`, or `[^\W\S]`.
 */
predicate isWildcardLike(RegExpTerm term) {
  term instanceof RegExpDot
  or
  // An upper-case character class escape such as `\W`.
  exists(RegExpCharacterClassEscape esc | esc = term | esc.getValue().isUppercase())
  or
  // A character class is wildcard-like when it is inverted and has no upper-case
  // escape among its children (e.g. `[^a-z]`), or when it is not inverted and
  // has at least one upper-case escape among its children (e.g. `[\W]`).
  exists(RegExpCharacterClass cls | cls = term |
    if cls.isInverted()
    then not cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
    else cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
  )
  or
  // An unbounded repetition of a wildcard is itself a wildcard.
  exists(InfiniteRepetitionQuantifier rep | rep = term | isWildcardLike(rep.getAChild()))
}
/**
 * Holds if the root term `term` acts as a generic sanitizer, either for strings that
 * match it (`outcome` is true) or for strings that do not match it (`outcome` is false).
 *
 * Whitelisting regexps such as `^(foo|bar)$` sanitize in the true case: they are fully
 * anchored and contain no wildcard-like term anywhere.
 * Inverted character classes such as `[^a-z]` or `\W` sanitize in the false case.
 */
predicate isGenericRegExpSanitizer(RegExpTerm term, boolean outcome) {
  term.isRootTerm() and
  (
    // Fully anchored and free of wildcard-like terms: a match implies a known-safe value.
    isFullyAnchoredTerm(term) and
    not exists(RegExpTerm inner | inner = term.getAChild*() and isWildcardLike(inner)) and
    outcome = true
    or
    // Character set restrictions like `/[^a-z]/.test(x)` sanitize in the false case.
    // The candidate is the root itself or, if the root is a group, a child of that group;
    // either the candidate or one of its alternatives must be wildcard-like.
    outcome = false and
    exists(RegExpTerm candidate | candidate = [term, term.(RegExpGroup).getAChild()] |
      isWildcardLike([candidate, candidate.(RegExpAlt).getAChild()])
    )
  )
}
/**
 * Gets the root term of a regular expression object, as created by a
 * `DataFlow::RegExpCreationNode`, a reference to which flows to `node`.
 */
overlay[global]
RegExpTerm getRegExpObjectFromNode(DataFlow::Node node) {
  exists(DataFlow::RegExpCreationNode creation |
    result = creation.getRoot() and
    creation.getAReference().flowsTo(node)
  )
}
/**
 * Gets the AST of a regular expression that can flow to `node`.
 *
 * This covers `RegExp` objects (see `getRegExpObjectFromNode`) as well as the case
 * where `node` is itself a string literal parsed as a regular expression.
 */
overlay[global]
RegExpTerm getRegExpFromNode(DataFlow::Node node) {
  // `node` is a string literal interpreted as a regular expression.
  exists(StringLiteral lit | lit = node.asExpr() | result = lit.asRegExp())
  or
  // A regular expression object flows to `node`.
  result = getRegExpObjectFromNode(node)
}
/**
 * A character that will be analyzed by `RegExp::alwaysMatchesMetaCharacter`.
 *
 * The concrete meta-characters are contributed by subclasses. This module provides
 * the HTML meta-characters `<`, `'`, and `"`, and the regular expression
 * meta-characters `{`, `[`, and `+`. New meta-characters can be added by
 * subclassing this class.
 */
abstract class MetaCharacter extends string {
  bindingset[this]
  MetaCharacter() { any() }

  /**
   * Holds if the atomic term `term` matches this meta-character.
   *
   * Only constants, `.`, character class escapes, and character ranges are
   * considered; the predicate does not hold for derived terms like alternatives
   * and groups.
   *
   * By default, `.`, `\W`, `\S`, and `\D` are treated as matching every meta-character.
   * Subclasses can override this predicate for meta-characters where that is not the case.
   */
  predicate matchedByAtom(RegExpTerm term) {
    term instanceof RegExpDot
    or
    this = term.(RegExpConstant).getConstantValue()
    or
    // NOTE(review): this comparison assumes `getValue()` yields the escape including
    // its leading backslash (e.g. `\W`) - confirm against `RegExpCharacterClassEscape`.
    exists(string escape | escape = term.(RegExpCharacterClassEscape).getValue() |
      escape = "\\W" or escape = "\\S" or escape = "\\D"
    )
    or
    // A character range whose bounds enclose this character (string comparison).
    exists(RegExpCharacterRange charRange, string lower, string upper |
      charRange = term and
      charRange.isRange(lower, upper)
    |
      this >= lower and
      this <= upper
    )
  }
}
/**
 * A meta-character used by HTML: one of `<`, `'`, or `"`.
 */
private class HtmlMetaCharacter extends MetaCharacter {
  HtmlMetaCharacter() {
    this = "<" or
    this = "'" or
    this = "\""
  }
}
/**
 * A meta-character used by regular expressions: one of `{`, `[`, or `+`.
 */
private class RegexpMetaChars extends MetaCharacter {
  RegexpMetaChars() {
    this = "{" or
    this = "[" or
    this = "+"
  }
}
/**
 * Holds if `term` can match any occurrence of `char` within a string, ignoring the
 * context in which `term` appears.
 *
 * This predicate is under-approximate: a sequence is never considered to guarantee
 * a match.
 */
predicate alwaysMatchesMetaCharacter(RegExpTerm term, MetaCharacter char) {
  // An atom that matches `char` directly.
  char.matchedByAtom(term) and
  not term.getParent() instanceof RegExpSequence // restrict size of predicate
  or
  // A group whose child always matches `char`.
  exists(RegExpGroup grp | grp = term | alwaysMatchesMetaCharacter(grp.getAChild(), char))
  or
  // An alternation with an alternative that always matches `char`.
  exists(RegExpAlt alt | alt = term | alwaysMatchesMetaCharacter(alt.getAlternative(), char))
  or
  // A character class: when inverted, no child may match `char`;
  // otherwise at least one child must match `char`.
  exists(RegExpCharacterClass cc | cc = term |
    if cc.isInverted()
    then not char.matchedByAtom(cc.getAChild())
    else char.matchedByAtom(cc.getAChild())
  )
}
}