/**
 * Provides classes for working with regular expressions.
 *
 * Regular expression literals are represented as an abstract syntax tree of regular expression
 * terms.
 */
overlay[local]
module;

import javascript
private import semmle.javascript.dataflow.InferredTypes
private import semmle.javascript.internal.CachedStages

/**
 * An element containing a regular expression term, that is, either
 * a regular expression literal, a string literal (parsed as a regular expression),
 * or another regular expression term.
 *
 * Examples:
 *
 * ```
 * // the regular expression literal and all terms it contains are regexp parents
 * /((ECMA|Java)[sS]cript)*$/
 * ```
 */
class RegExpParent extends Locatable, @regexpparent { }

/**
 * A regular expression term, that is, a syntactic part of a regular expression.
 *
 * Regular expression terms may occur as part of a regular expression literal,
 * such as `/[a-z]+/`, or as part of a string literal, such as `"[a-z]+"`.
 *
 * Note that some terms will occur as part of a string literal that isn't actually
 * interpreted as regular expression at runtime. Use `isPartOfRegExpLiteral`
 * or `isUsedAsRegExp` to check if a term is really used as a regular expression.
 *
 * Examples:
 *
 * ```
 * ((ECMA|Java)[sS]cript)*$
 * ((ECMA|Java)[sS]cript)*
 * (ECMA|Java)
 * $
 * ```
 */
class RegExpTerm extends Locatable, @regexpterm {
  /** Gets the `i`th child term of this term. */
  RegExpTerm getChild(int i) { regexpterm(result, _, this, i, _) }

  /** Gets a child term of this term. */
  RegExpTerm getAChild() { result = this.getChild(_) }

  /** Gets the number of child terms of this term. */
  int getNumChild() { result = count(this.getAChild()) }

  /** Gets the last child term of this term. */
  RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }

  /**
   * Gets the parent term of this regular expression term, or the
   * regular expression literal if this is the root term.
   */
  RegExpParent getParent() { regexpterm(this, _, result, _, _) }

  /** Gets the regular expression literal this term belongs to, if any. */
  RegExpLiteral getLiteral() { result = this.getRootTerm().getParent() }

  override string toString() { regexpterm(this, _, _, _, result) }

  /** Gets the raw source text of this term. */
  string getRawValue() { regexpterm(this, _, _, _, result) }

  /** Holds if this regular expression term can match the empty string. */
  predicate isNullable() { none() } // Overridden in subclasses.

  /** Gets the regular expression term that is matched (textually) before this one, if any. */
  RegExpTerm getPredecessor() {
    exists(RegExpTerm parent | parent = this.getParent() |
      result = parent.(RegExpSequence).previousElement(this)
      or
      // No earlier sibling in a sequence: continue with the parent's predecessor,
      // unless the parent is a sub-pattern.
      not exists(parent.(RegExpSequence).previousElement(this)) and
      not parent instanceof RegExpSubPattern and
      result = parent.getPredecessor()
    )
  }

  /** Gets the regular expression term that is matched (textually) after this one, if any. */
  RegExpTerm getSuccessor() {
    exists(RegExpTerm parent | parent = this.getParent() |
      result = parent.(RegExpSequence).nextElement(this)
      or
      // No later sibling in a sequence: continue with the parent's successor,
      // unless the parent is a sub-pattern.
      not exists(parent.(RegExpSequence).nextElement(this)) and
      not parent instanceof RegExpSubPattern and
      result = parent.getSuccessor()
    )
  }

  /**
   * Holds if this regular expression term is in a forward-matching context, that is,
   * it has no enclosing lookbehind assertions.
   */
  predicate isInForwardMatchingContext() { not this.isInBackwardMatchingContext() }

  /**
   * Holds if this regular expression term is in a backward-matching context, that is,
   * it has an enclosing lookbehind assertion.
   */
  predicate isInBackwardMatchingContext() { this = any(RegExpLookbehind lbh).getAChild+() }

  /**
   * Holds if this is the root term of a regular expression.
   */
  predicate isRootTerm() { not this.getParent() instanceof RegExpTerm }

  /**
   * Gets the outermost term of this regular expression.
   */
  RegExpTerm getRootTerm() {
    this.isRootTerm() and
    result = this
    or
    result = this.getParent().(RegExpTerm).getRootTerm()
  }

  /**
   * Holds if this term occurs as part of a regular expression literal.
   */
  predicate isPartOfRegExpLiteral() { exists(this.getLiteral()) }

  /**
   * Holds if this term occurs as part of a string literal.
   *
   * This predicate holds regardless of whether the string literal is actually
   * used as a regular expression. See `isUsedAsRegExp`.
   */
  predicate isPartOfStringLiteral() { this.getRootTerm().getParent() instanceof StringLiteral }

  /**
   * Holds if this term is part of a regular expression literal, or a string literal
   * that is interpreted as a regular expression.
   *
   * Unlike `isPartOfRegExpLiteral` and `isPartOfStringLiteral`, this predicate takes
   * data flow into account, to exclude string literals that aren't used as regular expressions.
   *
   * For example:
   * ```js
   * location.href.match("^https://example\\.com/") // YES - String is used as regexp
   *
   * console.log("Hello world"); // NO - string is not used as regexp
   *
   * /[a-z]+/g; // YES - Regexp literals are always used as regexp
   * ```
   */
  overlay[global]
  predicate isUsedAsRegExp() {
    exists(RegExpParent parent | parent = this.getRootTerm().getParent() |
      parent instanceof RegExpLiteral
      or
      parent.(Expr).flow() instanceof RegExpPatternSource
    )
  }

  /**
   * Gets the single string this regular-expression term matches.
   *
   * This predicate is only defined for (sequences/groups of) constant regular expressions.
   * In particular, terms involving zero-width assertions like `^` or `\b` are not considered
   * to have a constant value.
   *
   * Note that this predicate does not take flags of the enclosing regular-expression literal
   * into account.
   */
  string getConstantValue() { none() }

  /**
   * Gets a string that is matched by this regular-expression term.
   */
  string getAMatchedString() { result = this.getConstantValue() }

  /** Holds if this term has the specified location. */
  predicate hasLocationInfo(
    string filepath, int startline, int startcolumn, int endline, int endcolumn
  ) {
    this.getLocation().hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
  }
}

/**
 * A quantified regular expression term.
 *
 * Example:
 *
 * ```
 * ((ECMA|Java)[sS]cript)*
 * ```
 */
class RegExpQuantifier extends RegExpTerm, @regexp_quantifier {
  /** Holds if the quantifier of this term is a greedy quantifier. */
  predicate isGreedy() { is_greedy(this) }
}

/**
 * A regular expression term that permits unlimited repetitions.
 */
class InfiniteRepetitionQuantifier extends RegExpQuantifier {
  InfiniteRepetitionQuantifier() {
    this instanceof RegExpPlus
    or
    this instanceof RegExpStar
    or
    // A range quantifier counts only when it has no upper bound.
    this instanceof RegExpRange and not exists(this.(RegExpRange).getUpperBound())
  }
}

/**
 * An escaped regular expression term, that is, a regular expression
 * term starting with a backslash.
 *
 * Example:
 *
 * ```
 * \.
 * \w
 * ```
 */
class RegExpEscape extends RegExpTerm, @regexp_escape {
  override string getAPrimaryQlClass() { result = "RegExpEscape" }
}

/**
 * A constant regular expression term, that is, a regular expression
 * term matching a single string.
 *
 * Example:
 *
 * ```
 * abc
 * ```
 */
class RegExpConstant extends RegExpTerm, @regexp_constant {
  /** Gets the string matched by this constant term. */
  string getValue() { regexp_const_value(this, result) }

  /**
   * Holds if this constant represents a valid Unicode character (as opposed
   * to a surrogate code point that does not correspond to a character by itself.)
   */
  predicate isCharacter() { any() }

  override predicate isNullable() { none() }

  override string getConstantValue() { result = this.getValue() }

  override string getAPrimaryQlClass() { result = "RegExpConstant" }
}

/**
 * A character escape in a regular expression.
 *
 * Example:
 *
 * ```
 * \.
 * ```
 */
class RegExpCharEscape extends RegExpEscape, RegExpConstant, @regexp_char_escape {
  override predicate isCharacter() {
    not (
      // unencodable characters are represented as '?' or \uFFFD in the database
      this.getValue() = ["?", 65533.toUnicode()] and
      exists(string s | s = this.toString().toLowerCase() |
        // only Unicode escapes give rise to unencodable characters
        s.matches("\\\\u%") and
        // but '\u003f' actually is the '?' character itself
        s != "\\u003f"
      )
    )
  }

  override string getAPrimaryQlClass() { result = "RegExpCharEscape" }
}

/**
 * An alternative term, that is, a term of the form `a|b`.
 *
 * Example:
 *
 * ```
 * ECMA|Java
 * ```
 */
class RegExpAlt extends RegExpTerm, @regexp_alt {
  /** Gets an alternative of this term. */
  RegExpTerm getAlternative() { result = this.getAChild() }

  /** Gets the number of alternatives of this term. */
  int getNumAlternative() { result = this.getNumChild() }

  override predicate isNullable() { this.getAlternative().isNullable() }

  override string getAMatchedString() { result = this.getAlternative().getAMatchedString() }

  override string getAPrimaryQlClass() { result = "RegExpAlt" }
}

/**
 * An intersection term, that is, a term of the form `[[a]&&[ab]]`.
 *
 * Example:
 *
 * ```
 * /[[abc]&&[bcd]]/v - which matches 'b' and 'c' only.
 * ```
 */
class RegExpIntersection extends RegExpTerm, @regexp_intersection {
  /** Gets an intersected term of this term. */
  RegExpTerm getAnElement() { result = this.getAChild() }

  /** Gets the number of intersected terms of this term. */
  int getNumIntersectedTerm() { result = this.getNumChild() }

  override predicate isNullable() { this.getAnElement().isNullable() }

  override string getAPrimaryQlClass() { result = "RegExpIntersection" }
}

/**
 * A subtraction term, that is, a term of the form `[[a]--[ab]]`.
 *
 * Example:
 *
 * ```
 * /[[abc]--[bc]]/v - which matches 'a' only.
 * ```
 */
class RegExpSubtraction extends RegExpTerm, @regexp_subtraction {
  /** Gets the minuend (left operand) of this subtraction. */
  RegExpTerm getFirstTerm() { result = this.getChild(0) }

  /** Gets the number of subtracted terms of this term, that is, all children but the first. */
  int getNumSubtractedTerm() { result = this.getNumChild() - 1 }

  /** Gets a subtrahend (right operand) of this subtraction. */
  RegExpTerm getASubtractedTerm() { exists(int i | i > 0 and result = this.getChild(i)) }

  override predicate isNullable() { none() }

  override string getAPrimaryQlClass() { result = "RegExpSubtraction" }
}

/**
 * A sequence term.
 *
 * Example:
 *
 * ```
 * (ECMA|Java)Script
 * ```
 *
 * This is a sequence with the elements `(ECMA|Java)` and `Script`.
 */
class RegExpSequence extends RegExpTerm, @regexp_seq {
  /** Gets an element of this sequence. */
  RegExpTerm getElement() { result = this.getAChild() }

  /** Gets the number of elements in this sequence. */
  int getNumElement() { result = this.getNumChild() }

  override predicate isNullable() {
    forall(RegExpTerm child | child = this.getAChild() | child.isNullable())
  }

  override string getConstantValue() { result = this.getConstantValue(0) }

  /**
   * Gets the single string matched by the `i`th child and all following children of
   * this sequence, if any.
   */
  private string getConstantValue(int i) {
    // Base case: one past the last child contributes the empty string.
    i = this.getNumChild() and
    result = ""
    or
    result = this.getChild(i).getConstantValue() + this.getConstantValue(i + 1)
  }

  /** Gets the element preceding `element` in this sequence. */
  RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) }

  /** Gets the element following `element` in this sequence. */
  RegExpTerm nextElement(RegExpTerm element) {
    exists(int i |
      element = this.getChild(i) and
      result = this.getChild(i + 1)
    )
  }

  override string getAPrimaryQlClass() { result = "RegExpSequence" }
}

/**
 * A dollar `$` or caret assertion `^` matching the beginning or end of a line.
 *
 * Example:
 *
 * ```
 * ^
 * $
 * ```
 */
class RegExpAnchor extends RegExpTerm, @regexp_anchor {
  override predicate isNullable() { any() }

  override string getAPrimaryQlClass() { result = "RegExpAnchor" }

  /** Gets the char for this term. */
  abstract string getChar();
}

/**
 * A caret assertion `^` matching the beginning of a line.
*
 * Example:
 *
 * ```
 * ^
 * ```
 */
class RegExpCaret extends RegExpAnchor, @regexp_caret {
  override string getChar() { result = "^" }

  override string getAPrimaryQlClass() { result = "RegExpCaret" }
}

/**
 * The `$` assertion, which matches at the end of a line.
 *
 * Example:
 *
 * ```
 * $
 * ```
 */
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
  override string getChar() { result = "$" }

  override string getAPrimaryQlClass() { result = "RegExpDollar" }
}

/**
 * The word boundary assertion `\b`.
 *
 * Example:
 *
 * ```
 * \b
 * ```
 */
class RegExpWordBoundary extends RegExpTerm, @regexp_wordboundary {
  override string getAPrimaryQlClass() { result = "RegExpWordBoundary" }

  /** Always holds: the assertion itself consumes no input. */
  override predicate isNullable() { any() }
}

/**
 * The non-word boundary assertion `\B`.
 *
 * Example:
 *
 * ```
 * \B
 * ```
 */
class RegExpNonWordBoundary extends RegExpTerm, @regexp_nonwordboundary {
  override string getAPrimaryQlClass() { result = "RegExpNonWordBoundary" }

  /** Always holds: the assertion itself consumes no input. */
  override predicate isNullable() { any() }
}

/**
 * A parenthesized group in a regular expression, which may be a
 * (possibly named) capture group or a non-capture group.
 *
 * Examples:
 *
 * ```
 * (ECMA|Java)
 * (?:ECMA|Java)
 * (?<quote>['"])
 * ```
 */
class RegExpGroup extends RegExpTerm, @regexp_group {
  /** Holds if this is a capture group. */
  predicate isCapture() { exists(int index | is_capture(this, index)) }

  /**
   * Gets the index of this capture group within the enclosing regular
   * expression literal.
   *
   * For example, in the regular expression `/((a?).)(?:b)/`, the
   * group `((a?).)` has index 1, the group `(a?)` nested inside it
   * has index 2, and the group `(?:b)` has no index, since it is
   * not a capture group.
   */
  int getNumber() { is_capture(this, result) }

  /** Holds if this is a named capture group. */
  predicate isNamed() { exists(string name | is_named_capture(this, name)) }

  /** Gets the name of this capture group, if any. */
  string getName() { is_named_capture(this, result) }

  override string getAPrimaryQlClass() { result = "RegExpGroup" }

  /** Holds if some child of this group can match the empty string. */
  override predicate isNullable() {
    exists(RegExpTerm child | child = this.getAChild() and child.isNullable())
  }

  /** Gets the constant value of a child of this group, if any. */
  override string getConstantValue() {
    exists(RegExpTerm child | child = this.getAChild() and result = child.getConstantValue())
  }

  /** Gets a string matched by a child of this group. */
  override string getAMatchedString() {
    exists(RegExpTerm child | child = this.getAChild() and result = child.getAMatchedString())
  }
}

/**
 * A run of ordinary characters that carry no special meaning in a regular expression.
 *
 * Example:
 *
 * ```
 * abc
 * ;
 * ```
 */
class RegExpNormalConstant extends RegExpConstant, @regexp_normal_constant {
  override string getAPrimaryQlClass() { result = "RegExpNormalConstant" }
}

/**
 * A character escape written in hexadecimal notation.
 *
 * Example:
 *
 * ```
 * \x0a
 * ```
 */
class RegExpHexEscape extends RegExpCharEscape, @regexp_hex_escape {
  override string getAPrimaryQlClass() { result = "RegExpHexEscape" }
}

/**
 * A character escape written as a Unicode escape.
 *
 * Example:
 *
 * ```
 * \u000a
 * ```
 */
class RegExpUnicodeEscape extends RegExpCharEscape, @regexp_unicode_escape {
  override string getAPrimaryQlClass() { result = "RegExpUnicodeEscape" }
}

/**
 * A character escape written in decimal notation.
 *
 * Example:
 *
 * ```
 * \0
 * ```
 */
class RegExpDecimalEscape extends RegExpCharEscape, @regexp_dec_escape {
  override string getAPrimaryQlClass() { result = "RegExpDecimalEscape" }
}

/**
 * A character escape written in octal notation.
 *
 * Example:
 *
 * ```
 * \0177
 * ```
 */
class RegExpOctalEscape extends RegExpCharEscape, @regexp_oct_escape {
  override string getAPrimaryQlClass() { result = "RegExpOctalEscape" }
}

/**
 * A control character escape.
 *
 * Example:
 *
 * ```
 * \ca
 * ```
 */
class RegExpControlEscape extends RegExpCharEscape, @regexp_ctrl_escape {
  override string getAPrimaryQlClass() { result = "RegExpControlEscape" }
}

/**
 * A character class escape in a regular expression.
*
 * Examples:
 *
 * ```
 * \w
 * \S
 * ```
 */
class RegExpCharacterClassEscape extends RegExpEscape, @regexp_char_class_escape {
  /** Gets the name of the character class; for example, `w` for `\w`. */
  string getValue() { char_class_escape(this, result) }

  override predicate isNullable() { none() }

  override string getAPrimaryQlClass() { result = "RegExpCharacterClassEscape" }
}

/**
 * A Unicode property escape in a regular expression.
 *
 * Examples:
 *
 * ```
 * \p{Number}
 * \p{Script=Greek}
 * ```
 */
class RegExpUnicodePropertyEscape extends RegExpEscape, @regexp_unicode_property_escape {
  /**
   * Gets the name of this Unicode property; for example, `Number` for `\p{Number}` and
   * `Script` for `\p{Script=Greek}`.
   */
  string getName() { unicode_property_escapename(this, result) }

  /**
   * Gets the value of this Unicode property, if any.
   *
   * For example, the value of Unicode property `\p{Script=Greek}` is `Greek`, while
   * `\p{Number}` does not have a value.
   */
  string getValue() { unicode_property_escapevalue(this, result) }

  override predicate isNullable() { none() }

  override string getAPrimaryQlClass() { result = "RegExpUnicodePropertyEscape" }
}

/**
 * An identity escape, that is, an escaped character in a regular expression that just
 * represents itself.
 *
 * Examples:
 *
 * ```
 * \\
 * \/
 * ```
 */
class RegExpIdentityEscape extends RegExpCharEscape, @regexp_id_escape {
  override string getAPrimaryQlClass() { result = "RegExpIdentityEscape" }
}

/**
 * A back reference, that is, a term of the form `\i` or `\k<name>`
 * in a regular expression.
 *
 * Examples:
 *
 * ```
 * \1
 * \k<name>
 * ```
 */
class RegExpBackRef extends RegExpTerm, @regexp_backref {
  /**
   * Gets the number of the capture group this back reference refers to, if any.
   */
  int getNumber() { backref(this, result) }

  /**
   * Gets the name of the capture group this back reference refers to, if any.
   */
  string getName() { named_backref(this, result) }

  /** Gets the capture group this back reference refers to. */
  RegExpGroup getGroup() {
    // Only groups of the same regular expression literal are candidates;
    // they are matched either by index or by name.
    result.getLiteral() = this.getLiteral() and
    (
      result.getNumber() = this.getNumber() or
      result.getName() = this.getName()
    )
  }

  override predicate isNullable() { this.getGroup().isNullable() }

  override string getAPrimaryQlClass() { result = "RegExpBackRef" }
}

/**
 * A character class in a regular expression.
 *
 * Examples:
 *
 * ```
 * [a-z_]
 * [^<>&]
 * ```
 */
class RegExpCharacterClass extends RegExpTerm, @regexp_char_class {
  /** Holds if this is an inverted character class, that is, a term of the form `[^...]`. */
  predicate isInverted() { is_inverted(this) }

  override predicate isNullable() { none() }

  /** Gets a string matched by a child of this class; not defined for inverted classes. */
  override string getAMatchedString() {
    not this.isInverted() and result = this.getAChild().getAMatchedString()
  }

  /**
   * Holds if this character class matches any character.
   */
  predicate isUniversalClass() {
    // [^]
    this.isInverted() and not exists(this.getAChild())
    or
    // [\w\W] and similar
    not this.isInverted() and
    exists(string cce1, string cce2 |
      cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and
      cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue()
    |
      // Two distinct escapes that differ only in case, such as `w` and `W`.
      cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase()
    )
  }

  override string getAPrimaryQlClass() { result = "RegExpCharacterClass" }
}

/**
 * A character range in a character class in a regular expression.
 *
 * Example:
 *
 * ```
 * a-z
 * ```
 */
class RegExpCharacterRange extends RegExpTerm, @regexp_char_range {
  override predicate isNullable() { none() }

  /** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */
  predicate isRange(string lo, string hi) {
    lo = this.getChild(0).(RegExpConstant).getValue() and
    hi = this.getChild(1).(RegExpConstant).getValue()
  }

  override string getAPrimaryQlClass() { result = "RegExpCharacterRange" }
}

/** A parse error encountered while processing a regular expression literal. */
class RegExpParseError extends Error, @regexp_parse_error {
  /** Gets the regular expression term that triggered the parse error. */
  RegExpTerm getTerm() { regexp_parse_errors(this, result, _) }

  /** Gets the regular expression literal in which the parse error occurred. */
  RegExpLiteral getLiteral() { result = this.getTerm().getLiteral() }

  override string getMessage() { regexp_parse_errors(this, _, result) }

  override string toString() { result = this.getMessage() }

  /** Never holds: regular expression parse errors are not treated as fatal. */
  override predicate isFatal() { none() }
}

/**
 * Holds if `func` is a method defined on `String.prototype` with name `name`.
 */
overlay[global]
private predicate isNativeStringMethod(Function func, string name) {
  exists(ExternalInstanceMemberDecl decl |
    decl.hasQualifiedName("String", name) and
    func = decl.getInit()
  )
}

/**
 * Holds if `name` is the name of a property on a Match object returned by `String.prototype.match`,
 * not including array indices.
 */
overlay[global]
private predicate isMatchObjectProperty(string name) {
  any(ExternalInstanceMemberDecl decl).hasQualifiedName("Array", name)
  or
  name in ["length", "index", "input", "groups"]
}

/** Holds if `call` is a call to `match` whose result is used in a way that is incompatible with Match objects. */
overlay[global]
private predicate isUsedAsNonMatchObject(DataFlow::MethodCallNode call) {
  call.getMethodName() = ["match", "matchAll"] and
  call.getNumArgument() = 1 and
  (
    // Accessing a property that is absent on Match objects
    exists(string propName |
      exists(call.getAPropertyRead(propName)) and
      not isMatchObjectProperty(propName) and
      not exists(propName.toInt())
    )
    or
    // Awaiting the result
    call.flowsToExpr(any(AwaitExpr await).getOperand())
    or
    // Result is obviously unused
    call.asExpr() = any(ExprStmt stmt).getExpr()
    or
    // A call to the `match` member of the `sinon` module
    call = API::moduleImport("sinon").getMember("match").getACall()
  )
}

/**
 * Holds if `value` is used in a way that suggests it returns a number.
*/
overlay[global]
pragma[inline]
private predicate isUsedAsNumber(DataFlow::LocalSourceNode value) {
  // Compared against an operand that can be a number
  any(Comparison compare)
      .hasOperands(value.getALocalUse().asExpr(), any(Expr e | canBeNumber(e.analyze())))
  or
  // Operand of an arithmetic expression
  value.flowsToExpr(any(ArithmeticExpr e).getAnOperand())
  or
  // Operand of unary minus
  value.flowsToExpr(any(UnaryExpr e | e.getOperator() = "-").getOperand())
  or
  // Used as the property name of an index expression
  value.flowsToExpr(any(IndexExpr expr).getPropertyNameExpr())
  or
  // Passed as an argument to one of the listed callee names
  exists(DataFlow::CallNode call |
    call.getCalleeName() =
      ["substring", "substr", "slice", "splice", "charAt", "charCodeAt", "codePointAt", "toSpliced"] and
    value.flowsTo(call.getAnArgument())
  )
}

/** Holds if `TTString()` is among the inferred types of `node`. */
bindingset[node]
overlay[global]
pragma[inline_late]
private predicate canBeString(DataFlow::AnalyzedNode node) { node.getAType() = TTString() }

/** Holds if `TTNumber()` is among the inferred types of `node`. */
bindingset[node]
overlay[global]
pragma[inline_late]
private predicate canBeNumber(DataFlow::AnalyzedNode node) { node.getAType() = TTNumber() }

/**
 * Holds if `source` may be interpreted as a regular expression.
 */
overlay[global]
cached
predicate isInterpretedAsRegExp(DataFlow::Node source) {
  Stages::Taint::ref() and
  canBeString(source) and
  (
    // The first argument to an invocation of `RegExp` (with or without `new`).
    source = DataFlow::globalVarRef("RegExp").getAnInvocation().getArgument(0)
    or
    // The argument of a call that coerces the argument to a regular expression.
    exists(DataFlow::MethodCallNode mce, string methodName |
      canBeString(mce.getReceiver()) and
      mce.getMethodName() = methodName and
      // Every resolved callee, if any, must be the native string method of that name.
      not exists(Function func | func = mce.getACallee() |
        not isNativeStringMethod(func, methodName)
      )
    |
      methodName = ["match", "matchAll"] and
      source = mce.getArgument(0) and
      mce.getNumArgument() = 1 and
      not isUsedAsNonMatchObject(mce)
      or
      methodName = "search" and
      source = mce.getArgument(0) and
      mce.getNumArgument() = 1 and
      // "search" is a common method name, and the built-in "search" method is rarely used,
      // so to reduce FPs we also require that the return value appears to be used as a number.
      isUsedAsNumber(mce)
    )
    or
    // The `pattern` property and the keys of `patternProperties` in a JSON schema.
    exists(DataFlow::SourceNode schema | schema = JsonSchema::getAPartOfJsonSchema() |
      source = schema.getAPropertyWrite("pattern").getRhs()
      or
      source =
        schema
            .getAPropertySource("patternProperties")
            .getAPropertyWrite()
            .getPropertyNameExpr()
            .flow()
    )
  )
}

/**
 * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
 * as a part of a regular expression.
 *
 * The type back-tracker `t` records the state of the backwards search from `re`.
 */
overlay[global]
private DataFlow::Node regExpSource(DataFlow::Node re, DataFlow::TypeBackTracker t) {
  t.start() and
  re = result and
  isInterpretedAsRegExp(result)
  or
  exists(DataFlow::TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
    t2 = t.smallstep(result, succ)
    or
    // Shared taint steps are followed without changing the tracker state.
    TaintTracking::sharedTaintStep(result, succ) and
    t = t2
  )
}

/**
 * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
 * as a part of a regular expression.
 *
 * This is the two-argument `regExpSource` restricted to `DataFlow::TypeBackTracker::end()`.
 */
overlay[global]
private DataFlow::Node regExpSource(DataFlow::Node re) {
  result = regExpSource(re, DataFlow::TypeBackTracker::end())
}

/**
 * A node whose value may flow to a position where it is interpreted
 * as a part of a regular expression.
 */
overlay[global]
abstract class RegExpPatternSource extends DataFlow::Node {
  /**
   * Gets a node where the pattern of this node is parsed as a part of
   * a regular expression.
   */
  abstract DataFlow::Node getAParse();

  /**
   * Gets the pattern of this node that is interpreted as a part of a
   * regular expression.
   */
  abstract string getPattern();

  /**
   * Gets a regular expression object that is constructed from the pattern
   * of this node.
   */
  abstract DataFlow::SourceNode getARegExpObject();

  /**
   * Gets the root term of the regular expression parsed from this pattern.
   */
  abstract RegExpTerm getRegExpTerm();
}

/**
 * A regular expression literal, viewed as the pattern source for itself.
*/
overlay[global]
private class RegExpLiteralPatternSource extends RegExpPatternSource, DataFlow::ValueNode {
  override RegExpLiteral astNode;

  /** Gets this node itself: a literal is its own parse location. */
  override DataFlow::Node getAParse() { result = this }

  override string getPattern() {
    // hide the fact that `/` is escaped in the literal
    result = astNode.getRoot().getRawValue().regexpReplaceAll("\\\\/", "/")
  }

  /** Gets this node itself: a literal is its own regular expression object. */
  override DataFlow::SourceNode getARegExpObject() { result = this }

  override RegExpTerm getRegExpTerm() { result = astNode.getRoot() }
}

/**
 * A node whose string value may flow to a position where it is interpreted
 * as a part of a regular expression.
 */
overlay[global]
private class StringRegExpPatternSource extends RegExpPatternSource {
  DataFlow::Node parse;

  StringRegExpPatternSource() { this = regExpSource(parse) }

  override DataFlow::Node getAParse() { result = parse }

  /** Gets an invocation of the global `RegExp` whose first argument is the parse node. */
  override DataFlow::SourceNode getARegExpObject() {
    exists(DataFlow::InvokeNode constructor |
      constructor = DataFlow::globalVarRef("RegExp").getAnInvocation() and
      parse = constructor.getArgument(0) and
      result = constructor
    )
  }

  override string getPattern() { result = this.getStringValue() }

  /** Gets the term parsed from this node, defined only when it is a string literal. */
  override RegExpTerm getRegExpTerm() { result = this.asExpr().(StringLiteral).asRegExp() }
}

/**
 * A node whose string value may flow to a position where it is interpreted
 * as a part of a regular expression, with its term taken from a string concatenation.
*/
overlay[global]
private class StringConcatRegExpPatternSource extends RegExpPatternSource {
  DataFlow::Node parse;

  StringConcatRegExpPatternSource() { this = regExpSource(parse) }

  override DataFlow::Node getAParse() { result = parse }

  /** Gets an invocation of the global `RegExp` whose first argument is the parse node. */
  override DataFlow::SourceNode getARegExpObject() {
    exists(DataFlow::InvokeNode constructor |
      constructor = DataFlow::globalVarRef("RegExp").getAnInvocation() and
      parse = constructor.getArgument(0) and
      result = constructor
    )
  }

  override string getPattern() { result = this.getStringValue() }

  /** Gets the term parsed from this node, defined only when it is an `AddExpr`. */
  override RegExpTerm getRegExpTerm() { result = this.asExpr().(AddExpr).asRegExp() }
}

/**
 * A quoted string escape in a regular expression, using the `\q` syntax.
 * The only operation supported inside a quoted string is alternation, using `|`.
 *
 * Example:
 *
 * ```
 * \q{foo}
 * \q{a|b|c}
 * ```
 */
class RegExpQuotedString extends RegExpTerm, @regexp_quoted_string {
  /** Gets the term representing the contents of this quoted string. */
  RegExpTerm getTerm() { result = this.getAChild() }

  override predicate isNullable() { none() }

  override string getAMatchedString() { result = this.getTerm().getAMatchedString() }

  override string getAPrimaryQlClass() { result = "RegExpQuotedString" }
}

/** Provides utility predicates related to regular expressions. */
module RegExp {
  /** Gets the string `"?"` used to represent a regular expression whose flags are unknown. */
  string unknownFlag() { result = "?" }

  /** Holds if `flags` includes the `m` flag. */
  bindingset[flags]
  predicate isMultiline(string flags) { flags.matches("%m%") }

  /** Holds if `flags` includes the `g` flag. */
  bindingset[flags]
  predicate isGlobal(string flags) { flags.matches("%g%") }

  /** Holds if `flags` includes the `i` flag. */
  bindingset[flags]
  predicate isIgnoreCase(string flags) { flags.matches("%i%") }

  /** Holds if `flags` includes the `s` flag. */
  bindingset[flags]
  predicate isDotAll(string flags) { flags.matches("%s%") }

  /** Holds if `flags` includes the `v` flag. */
  bindingset[flags]
  predicate isUnicodeSets(string flags) { flags.matches("%v%") }

  /** Holds if `flags` includes the `m` flag or is the unknown flag `?`. */
  bindingset[flags]
  predicate maybeMultiline(string flags) { flags = unknownFlag() or isMultiline(flags) }

  /** Holds if `flags` includes the `g` flag or is the unknown flag `?`. */
  bindingset[flags]
  predicate maybeGlobal(string flags) { flags = unknownFlag() or isGlobal(flags) }

  /** Holds if `flags` includes the `i` flag or is the unknown flag `?`. */
  bindingset[flags]
  predicate maybeIgnoreCase(string flags) { flags = unknownFlag() or isIgnoreCase(flags) }

  /** Holds if `flags` includes the `s` flag or is the unknown flag `?`. */
  bindingset[flags]
  predicate maybeDotAll(string flags) { flags = unknownFlag() or isDotAll(flags) }

  /** Holds if `term` and all of its disjuncts are anchored on both ends. */
  predicate isFullyAnchoredTerm(RegExpTerm term) {
    exists(RegExpSequence seq | term = seq |
      seq.getChild(0) instanceof RegExpCaret and
      seq.getLastChild() instanceof RegExpDollar
    )
    or
    isFullyAnchoredTerm(term.(RegExpGroup).getAChild())
    or
    // For an alternation, every disjunct must be fully anchored.
    isFullyAnchoredAlt(term, term.getNumChild())
  }

  /** Holds if the first `i` disjuncts of `term` are fully anchored. */
  private predicate isFullyAnchoredAlt(RegExpAlt term, int i) {
    isFullyAnchoredTerm(term.getChild(0)) and
    i = 1
    or
    isFullyAnchoredAlt(term, i - 1) and
    isFullyAnchoredTerm(term.getChild(i - 1))
  }

  /**
   * Holds if `term` matches any character except for explicitly listed exceptions.
   *
   * For example, holds for `.`, `[^<>]`, or `\W`, but not for `[a-z]`, `\w`, or `[^\W\S]`.
   */
  predicate isWildcardLike(RegExpTerm term) {
    term instanceof RegExpDot
    or
    term.(RegExpCharacterClassEscape).getValue().isUppercase()
    or
    // [^a-z]
    exists(RegExpCharacterClass cls | term = cls |
      cls.isInverted() and
      not cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
    )
    or
    // [\W]
    exists(RegExpCharacterClass cls | term = cls |
      not cls.isInverted() and
      cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
    )
    or
    // an unlimited number of wildcards, is also a wildcard.
    exists(InfiniteRepetitionQuantifier q |
      term = q and
      isWildcardLike(q.getAChild())
    )
  }

  /**
   * Holds if `term` is a generic sanitizer for strings that match (if `outcome` is true)
   * or strings that don't match (if `outcome` is false).
   *
   * Specifically, whitelisting regexps such as `^(foo|bar)$` sanitize matches in the true case.
   * Inverted character classes such as `[^a-z]` or `\W` sanitize matches in the false case.
   */
  predicate isGenericRegExpSanitizer(RegExpTerm term, boolean outcome) {
    term.isRootTerm() and
    (
      outcome = true and
      isFullyAnchoredTerm(term) and
      not isWildcardLike(term.getAChild*())
      or
      // Character set restrictions like `/[^a-z]/.test(x)` sanitize in the false case
      outcome = false and
      exists(RegExpTerm root |
        root = term
        or
        root = term.(RegExpGroup).getAChild()
      |
        isWildcardLike(root)
        or
        isWildcardLike(root.(RegExpAlt).getAChild())
      )
    )
  }

  /**
   * Gets the AST of a regular expression object that can flow to `node`.
   */
  overlay[global]
  RegExpTerm getRegExpObjectFromNode(DataFlow::Node node) {
    exists(DataFlow::RegExpCreationNode regexp |
      regexp.getAReference().flowsTo(node) and
      result = regexp.getRoot()
    )
  }

  /**
   * Gets the AST of a regular expression that can flow to `node`,
   * including `RegExp` objects as well as strings interpreted as regular expressions.
   */
  overlay[global]
  RegExpTerm getRegExpFromNode(DataFlow::Node node) {
    result = getRegExpObjectFromNode(node)
    or
    result = node.asExpr().(StringLiteral).asRegExp()
  }

  /**
   * A character that will be analyzed by `RegExp::alwaysMatchesMetaCharacter`.
   *
   * Currently only `<`, `'`, and `"` are considered to be meta-characters, but new meta-characters
   * can be added by subclassing this class.
   */
  abstract class MetaCharacter extends string {
    bindingset[this]
    MetaCharacter() { any() }

    /**
     * Holds if the given atomic term matches this meta-character.
     *
     * Does not hold for derived terms like alternatives and groups.
     *
     * By default, `.`, `\W`, `\S`, and `\D` are considered to match any meta-character,
     * but the predicate can be overridden for meta-characters where this is not the case.
     */
    predicate matchedByAtom(RegExpTerm term) {
      term.(RegExpConstant).getConstantValue() = this
      or
      term instanceof RegExpDot
      or
      // `RegExpCharacterClassEscape.getValue()` is the bare class name (`W` for `\W`),
      // so compare without the leading backslash.
      term.(RegExpCharacterClassEscape).getValue() = ["W", "S", "D"]
      or
      exists(string lo, string hi |
        term.(RegExpCharacterRange).isRange(lo, hi) and
        lo <= this and
        this <= hi
      )
    }
  }

  /**
   * A meta character used by HTML.
   */
  private class HtmlMetaCharacter extends MetaCharacter {
    HtmlMetaCharacter() { this = ["<", "'", "\""] }
  }

  /**
   * A meta character used by regular expressions.
   */
  private class RegexpMetaChars extends RegExp::MetaCharacter {
    RegexpMetaChars() { this = ["{", "[", "+"] }
  }

  /**
   * Holds if `term` can match any occurrence of `char` within a string (not taking into account
   * the context in which `term` appears).
   *
   * This predicate is under-approximate and never considers sequences to guarantee a match.
   */
  predicate alwaysMatchesMetaCharacter(RegExpTerm term, MetaCharacter char) {
    not term.getParent() instanceof RegExpSequence and // restrict size of predicate
    char.matchedByAtom(term)
    or
    alwaysMatchesMetaCharacter(term.(RegExpGroup).getAChild(), char)
    or
    alwaysMatchesMetaCharacter(term.(RegExpAlt).getAlternative(), char)
    or
    exists(RegExpCharacterClass class_ | term = class_ |
      not class_.isInverted() and
      char.matchedByAtom(class_.getAChild())
      or
      class_.isInverted() and
      not char.matchedByAtom(class_.getAChild())
    )
  }
}