diff --git a/python/ql/lib/semmle/python/RegexTreeView.qll b/python/ql/lib/semmle/python/RegexTreeView.qll index 2a11b16ac40..8e2ae1e90f3 100644 --- a/python/ql/lib/semmle/python/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/RegexTreeView.qll @@ -552,7 +552,7 @@ class RegExpWordBoundary extends RegExpSpecialChar { /** * A character class escape in a regular expression. - * That is, an escaped charachter that denotes multiple characters. + * That is, an escaped character that denotes multiple characters. * * Examples: * diff --git a/python/ql/lib/semmle/python/regex.qll b/python/ql/lib/semmle/python/regex.qll index 326ea674390..5f3dea9258e 100644 --- a/python/ql/lib/semmle/python/regex.qll +++ b/python/ql/lib/semmle/python/regex.qll @@ -188,7 +188,7 @@ abstract class RegexString extends Expr { ) } - /** Hold is a character set starts between `start` and `end`. */ + /** Holds if a character set starts between `start` and `end`. */ predicate char_set_start(int start, int end) { this.char_set_start(start) = true and ( @@ -316,8 +316,10 @@ abstract class RegexString extends Expr { result = this.(Bytes).getS() } + /** Gets the `i`th character of this regex */ string getChar(int i) { result = this.getText().charAt(i) } + /** Gets the `i`th character of this regex, unless it is part of a character escape sequence. */ string nonEscapedCharAt(int i) { result = this.getText().charAt(i) and not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) @@ -329,6 +331,9 @@ abstract class RegexString extends Expr { private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" and not this.inCharSet(i) } + /** + * Holds if the `i`th character could not be parsed. + */ predicate failedToParse(int i) { exists(this.getChar(i)) and not exists(int start, int end | @@ -417,6 +422,9 @@ abstract class RegexString extends Expr { ) } + /** + * Holds if a simple or escaped character is found between `start` and `end`. 
+ */ predicate character(int start, int end) { ( this.simpleCharacter(start, end) and @@ -428,12 +436,18 @@ abstract class RegexString extends Expr { not exists(int x, int y | this.backreference(x, y) and x <= start and y >= end) } + /** + * Holds if a normal character is found between `start` and `end`. + */ predicate normalCharacter(int start, int end) { end = start + 1 and this.character(start, end) and not this.specialCharacter(start, end, _) } + /** + * Holds if a special character is found between `start` and `end`. + */ predicate specialCharacter(int start, int end, string char) { not this.inCharSet(start) and this.character(start, end) and @@ -492,7 +506,7 @@ abstract class RegexString extends Expr { this.specialCharacter(start, end, _) } - /** Whether the text in the range start,end is a group */ + /** Whether the text in the range `start,end` is a group */ predicate group(int start, int end) { this.groupContents(start, end, _, _) or @@ -611,6 +625,7 @@ abstract class RegexString extends Expr { this.simple_group_start(start, end) } + /** Matches the start of a non-capturing group, e.g. `(?:` */ private predicate non_capturing_group_start(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and @@ -618,12 +633,18 @@ abstract class RegexString extends Expr { end = start + 3 } + /** Matches the start of a simple group, e.g. `(a+)`. */ private predicate simple_group_start(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) != "?" and end = start + 1 } + /** + * Matches the start of a named group, such as: + * - `(?\w+)` + * - `(?'name'\w+)` + */ private predicate named_group_start(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and @@ -675,6 +696,7 @@ abstract class RegexString extends Expr { ) } + /** Matches the start of a positive lookahead assertion, i.e. `(?=`. 
*/ private predicate lookahead_assertion_start(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and @@ -682,6 +704,7 @@ abstract class RegexString extends Expr { end = start + 3 } + /** Matches the start of a negative lookahead assertion, i.e. `(?!`. */ private predicate negative_lookahead_assertion_start(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and @@ -689,6 +712,7 @@ abstract class RegexString extends Expr { end = start + 3 } + /** Matches the start of a positive lookbehind assertion, i.e. `(?<=`. */ private predicate lookbehind_assertion_start(int start, int end) { this.isGroupStart(start) and this.getChar(start + 1) = "?" and @@ -697,6 +721,7 @@ abstract class RegexString extends Expr { end = start + 4 } + /** Matches the start of a negative lookbehind assertion, i.e. `(?`. */ private predicate named_backreference(int start, int end, string name) { this.named_backreference_start(start, start + 4) and end = min(int i | i > start + 4 and this.getChar(i) = ")") + 1 and name = this.getText().substring(start + 4, end - 2) } + /** Matches a numbered backreference, e.g. `\1`. 
*/ private predicate numbered_backreference(int start, int end, int value) { this.escapingChar(start) and // starting with 0 makes it an octal escape @@ -749,7 +778,7 @@ abstract class RegexString extends Expr { ) } - /** Whether the text in the range start,end is a back reference */ + /** Whether the text in the range `start,end` is a back reference */ predicate backreference(int start, int end) { this.numbered_backreference(start, end, _) or diff --git a/ruby/ql/consistency-queries/RegExpConsistency.ql b/ruby/ql/consistency-queries/RegExpConsistency.ql index 6155b86b7ce..9fd4cf22d95 100644 --- a/ruby/ql/consistency-queries/RegExpConsistency.ql +++ b/ruby/ql/consistency-queries/RegExpConsistency.ql @@ -1,4 +1,4 @@ -import codeql.ruby.security.performance.RegExpTreeView +import codeql.ruby.Regexp query predicate nonUniqueChild(RegExpParent parent, int i, RegExpTerm child) { child = parent.getChild(i) and diff --git a/ruby/ql/lib/change-notes/2022-03-21-regex.md b/ruby/ql/lib/change-notes/2022-03-21-regex.md new file mode 100644 index 00000000000..8abb60d37af --- /dev/null +++ b/ruby/ql/lib/change-notes/2022-03-21-regex.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* The `ParseRegExp` and `RegExpTreeView` modules are now "internal" modules. Users should use `codeql.ruby.Regexp` instead. diff --git a/ruby/ql/lib/codeql/ruby/Regexp.qll b/ruby/ql/lib/codeql/ruby/Regexp.qll new file mode 100644 index 00000000000..b6872d2bca3 --- /dev/null +++ b/ruby/ql/lib/codeql/ruby/Regexp.qll @@ -0,0 +1,143 @@ +/** + * Provides classes for working with regular expressions. + * + * Regular expression literals are represented as an abstract syntax tree of regular expression + * terms. 
+ */ + +import regexp.RegExpTreeView // re-export +private import regexp.internal.ParseRegExp +private import codeql.ruby.ast.Literal as AST +private import codeql.ruby.DataFlow +private import codeql.ruby.controlflow.CfgNodes +private import codeql.ruby.ApiGraphs +private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl + +/** + * Provides utility predicates related to regular expressions. + */ +module RegExpPatterns { + /** + * Gets a pattern that matches common top-level domain names in lower case. + */ + string getACommonTld() { + // according to ranking by http://google.com/search?q=site:.<> + result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])" + } +} + +/** + * A node whose value may flow to a position where it is interpreted + * as a part of a regular expression. + */ +abstract class RegExpPatternSource extends DataFlow::Node { + /** + * Gets a node where the pattern of this node is parsed as a part of + * a regular expression. + */ + abstract DataFlow::Node getAParse(); + + /** + * Gets the root term of the regular expression parsed from this pattern. + */ + abstract RegExpTerm getRegExpTerm(); +} + +/** + * A regular expression literal, viewed as the pattern source for itself. + */ +private class RegExpLiteralPatternSource extends RegExpPatternSource { + private AST::RegExpLiteral astNode; + + RegExpLiteralPatternSource() { astNode = this.asExpr().getExpr() } + + override DataFlow::Node getAParse() { result = this } + + override RegExpTerm getRegExpTerm() { result = astNode.getParsed() } +} + +/** + * A node whose string value may flow to a position where it is interpreted + * as a part of a regular expression. 
+ */ +private class StringRegExpPatternSource extends RegExpPatternSource { + private DataFlow::Node parse; + + StringRegExpPatternSource() { this = regExpSource(parse) } + + override DataFlow::Node getAParse() { result = parse } + + override RegExpTerm getRegExpTerm() { result.getRegExp() = this.asExpr().getExpr() } +} + +private class RegExpLiteralRegExp extends RegExp, AST::RegExpLiteral { + override predicate isDotAll() { this.hasMultilineFlag() } + + override predicate isIgnoreCase() { this.hasCaseInsensitiveFlag() } + + override string getFlags() { result = this.getFlagString() } +} + +private class ParsedStringRegExp extends RegExp { + private DataFlow::Node parse; + + ParsedStringRegExp() { this = regExpSource(parse).asExpr().getExpr() } + + DataFlow::Node getAParse() { result = parse } + + override predicate isDotAll() { none() } + + override predicate isIgnoreCase() { none() } + + override string getFlags() { none() } +} + +/** + * Holds if `source` may be interpreted as a regular expression. + */ +private predicate isInterpretedAsRegExp(DataFlow::Node source) { + // The first argument to an invocation of `Regexp.new` or `Regexp.compile`. + source = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]).getArgument(0) + or + // The argument of a call that coerces the argument to a regular expression. 
+ exists(DataFlow::CallNode mce | + mce.getMethodName() = ["match", "match?"] and + source = mce.getArgument(0) and + // exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match + not mce.getReceiver().asExpr().getExpr() instanceof AST::RegExpLiteral + ) +} + +private class RegExpConfiguration extends Configuration { + RegExpConfiguration() { this = "RegExpConfiguration" } + + override predicate isSource(DataFlow::Node source) { + source.asExpr() = + any(ExprCfgNode e | + e.getConstantValue().isString(_) and + not e instanceof ExprNodes::VariableReadAccessCfgNode and + not e instanceof ExprNodes::ConstantReadAccessCfgNode + ) + } + + override predicate isSink(DataFlow::Node sink) { isInterpretedAsRegExp(sink) } + + override predicate isSanitizer(DataFlow::Node node) { + // stop flow if `node` is receiver of + // https://ruby-doc.org/core-2.4.0/String.html#method-i-match + exists(DataFlow::CallNode mce | + mce.getMethodName() = ["match", "match?"] and + node = mce.getReceiver() and + mce.getArgument(0).asExpr().getExpr() instanceof AST::RegExpLiteral + ) + } +} + +/** + * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted + * as a part of a regular expression. 
+ */ +cached +DataFlow::Node regExpSource(DataFlow::Node re) { + exists(RegExpConfiguration c | c.hasFlow(result, re)) +} diff --git a/ruby/ql/lib/codeql/ruby/ast/Literal.qll b/ruby/ql/lib/codeql/ruby/ast/Literal.qll index d5ff5569180..4892d233951 100644 --- a/ruby/ql/lib/codeql/ruby/ast/Literal.qll +++ b/ruby/ql/lib/codeql/ruby/ast/Literal.qll @@ -1,5 +1,5 @@ private import codeql.ruby.AST -private import codeql.ruby.security.performance.RegExpTreeView as RETV +private import codeql.ruby.Regexp as RE private import internal.AST private import internal.Constant private import internal.Literal @@ -393,7 +393,7 @@ class RegExpLiteral extends StringlikeLiteral instanceof RegExpLiteralImpl { final predicate hasFreeSpacingFlag() { this.getFlagString().charAt(_) = "x" } /** Returns the root node of the parse tree of this regular expression. */ - final RETV::RegExpTerm getParsed() { result = RETV::getParsedRegExp(this) } + final RE::RegExpTerm getParsed() { result = RE::getParsedRegExp(this) } } /** diff --git a/ruby/ql/lib/codeql/ruby/printAst.qll b/ruby/ql/lib/codeql/ruby/printAst.qll index b04a9fec50a..3056e9aa49f 100644 --- a/ruby/ql/lib/codeql/ruby/printAst.qll +++ b/ruby/ql/lib/codeql/ruby/printAst.qll @@ -7,7 +7,7 @@ */ private import AST -private import codeql.ruby.security.performance.RegExpTreeView as RETV +private import codeql.ruby.Regexp as RE private import codeql.ruby.ast.internal.Synthesis /** @@ -37,7 +37,7 @@ private predicate shouldPrintAstEdge(AstNode parent, string edgeName, AstNode ch newtype TPrintNode = TPrintRegularAstNode(AstNode n) { shouldPrintNode(n) } or - TPrintRegExpNode(RETV::RegExpTerm term) { + TPrintRegExpNode(RE::RegExpTerm term) { exists(RegExpLiteral literal | shouldPrintNode(literal) and term.getRootTerm() = literal.getParsed() @@ -107,7 +107,7 @@ class PrintRegularAstNode extends PrintAstNode, TPrintRegularAstNode { or // If this AST node is a regexp literal, add the parsed regexp tree as a // child. 
- exists(RETV::RegExpTerm t | t = astNode.(RegExpLiteral).getParsed() | + exists(RE::RegExpTerm t | t = astNode.(RegExpLiteral).getParsed() | result = TPrintRegExpNode(t) and edgeName = "getParsed" ) } @@ -134,7 +134,7 @@ class PrintRegularAstNode extends PrintAstNode, TPrintRegularAstNode { /** A parsed regexp node in the output tree. */ class PrintRegExpNode extends PrintAstNode, TPrintRegExpNode { - RETV::RegExpTerm regexNode; + RE::RegExpTerm regexNode; PrintRegExpNode() { this = TPrintRegExpNode(regexNode) } @@ -147,7 +147,7 @@ class PrintRegExpNode extends PrintAstNode, TPrintRegExpNode { exists(int i | result = TPrintRegExpNode(regexNode.getChild(i)) and edgeName = i.toString()) } - override int getOrder() { exists(RETV::RegExpTerm p | p.getChild(result) = regexNode) } + override int getOrder() { exists(RE::RegExpTerm p | p.getChild(result) = regexNode) } override predicate hasLocationInfo( string filepath, int startline, int startcolumn, int endline, int endcolumn diff --git a/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll new file mode 100644 index 00000000000..563130b1c42 --- /dev/null +++ b/ruby/ql/lib/codeql/ruby/regexp/RegExpTreeView.qll @@ -0,0 +1,1030 @@ +/** Provides a class hierarchy corresponding to a parse tree of regular expressions. */ + +private import internal.ParseRegExp +private import codeql.NumberUtils +private import codeql.ruby.ast.Literal as AST +private import codeql.Locations + +/** + * An element containing a regular expression term, that is, either + * a string literal (parsed as a regular expression) + * or another regular expression term. + * + * For sequences and alternations, we require at least one child. + * Otherwise, we wish to represent the term differently. + * This avoids multiple representations of the same term. 
+ */ +private newtype TRegExpParent = + /** A string literal used as a regular expression */ + TRegExpLiteral(RegExp re) or + /** A quantified term */ + TRegExpQuantifier(RegExp re, int start, int end) { re.qualifiedItem(start, end, _, _) } or + /** A sequence term */ + TRegExpSequence(RegExp re, int start, int end) { re.sequence(start, end) } or + /** An alternation term */ + TRegExpAlt(RegExp re, int start, int end) { re.alternation(start, end) } or + /** A character class term */ + TRegExpCharacterClass(RegExp re, int start, int end) { re.charSet(start, end) } or + /** A character range term */ + TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or + /** A group term */ + TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or + /** A special character */ + TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or + /** A normal character */ + TRegExpNormalChar(RegExp re, int start, int end) { + re.normalCharacterSequence(start, end) + or + re.escapedCharacter(start, end) and + not re.specialCharacter(start, end, _) + } or + /** A back reference */ + TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or + /** A named character property */ + TRegExpNamedCharacterProperty(RegExp re, int start, int end) { + re.namedCharacterProperty(start, end, _) + } + +/** + * An element containing a regular expression term, that is, either + * a string literal (parsed as a regular expression) + * or another regular expression term. + */ +class RegExpParent extends TRegExpParent { + /** Gets a textual representation of this element. */ + string toString() { result = "RegExpParent" } + + /** Gets the `i`th child term. */ + RegExpTerm getChild(int i) { none() } + + /** Gets a child term. */ + final RegExpTerm getAChild() { result = this.getChild(_) } + + /** Gets the number of child terms.
*/ + int getNumChild() { result = count(this.getAChild()) } + + /** + * Gets the name of a primary CodeQL class to which this regular + * expression term belongs. + */ + string getAPrimaryQlClass() { result = "RegExpParent" } + + /** + * Gets a comma-separated list of the names of the primary CodeQL classes to + * which this regular expression term belongs. + */ + final string getPrimaryQlClasses() { result = concat(this.getAPrimaryQlClass(), ",") } +} + +/** A string literal used as a regular expression */ +class RegExpLiteral extends TRegExpLiteral, RegExpParent { + RegExp re; + + RegExpLiteral() { this = TRegExpLiteral(re) } + + override RegExpTerm getChild(int i) { i = 0 and result.getRegExp() = re and result.isRootTerm() } + + /** Holds if dot, `.`, matches all characters, including newlines. */ + predicate isDotAll() { re.isDotAll() } + + /** Holds if matching with this regex is case-insensitive. */ + predicate isIgnoreCase() { re.isIgnoreCase() } + + /** Gets a string representing all modes for this regex. */ + string getFlags() { result = re.getFlags() } + + /** Gets the primary QL class for this regex. */ + override string getAPrimaryQlClass() { result = "RegExpLiteral" } +} + +/** + * A regular expression term, that is, a syntactic part of a regular expression. + */ +class RegExpTerm extends RegExpParent { + RegExp re; + int start; + int end; + + RegExpTerm() { + this = TRegExpAlt(re, start, end) + or + this = TRegExpBackRef(re, start, end) + or + this = TRegExpCharacterClass(re, start, end) + or + this = TRegExpCharacterRange(re, start, end) + or + this = TRegExpNormalChar(re, start, end) + or + this = TRegExpGroup(re, start, end) + or + this = TRegExpQuantifier(re, start, end) + or + this = TRegExpSequence(re, start, end) and + exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead.
+ or + this = TRegExpSpecialChar(re, start, end) + or + this = TRegExpNamedCharacterProperty(re, start, end) + } + + /** + * Gets the outermost term of this regular expression. + */ + RegExpTerm getRootTerm() { + this.isRootTerm() and result = this + or + result = this.getParent().(RegExpTerm).getRootTerm() + } + + /** + * Holds if this term is part of a string literal + * that is interpreted as a regular expression. + */ + predicate isUsedAsRegExp() { any() } + + /** + * Holds if this is the root term of a regular expression. + */ + predicate isRootTerm() { start = 0 and end = re.getText().length() } + + override RegExpTerm getChild(int i) { + result = this.(RegExpAlt).getChild(i) + or + result = this.(RegExpBackRef).getChild(i) + or + result = this.(RegExpCharacterClass).getChild(i) + or + result = this.(RegExpCharacterRange).getChild(i) + or + result = this.(RegExpNormalChar).getChild(i) + or + result = this.(RegExpGroup).getChild(i) + or + result = this.(RegExpQuantifier).getChild(i) + or + result = this.(RegExpSequence).getChild(i) + or + result = this.(RegExpSpecialChar).getChild(i) + or + result = this.(RegExpNamedCharacterProperty).getChild(i) + } + + /** + * Gets the parent term of this regular expression term, or the + * regular expression literal if this is the root term. + */ + RegExpParent getParent() { result.getAChild() = this } + + /** Gets the associated `RegExp`. */ + RegExp getRegExp() { result = re } + + /** Gets the offset at which this term starts. */ + int getStart() { result = start } + + /** Gets the offset at which this term ends. */ + int getEnd() { result = end } + + override string toString() { result = re.getText().substring(start, end) } + + /** + * Gets the location of the surrounding regex, as locations inside the regex do not exist. + * To get location information corresponding to the term inside the regex, + * use `hasLocationInfo`. 
+ */ + Location getLocation() { result = re.getLocation() } + + /** Holds if this term is found at the specified location offsets. */ + predicate hasLocationInfo( + string filepath, int startline, int startcolumn, int endline, int endcolumn + ) { + exists(int re_start, int re_end | + re.getComponent(0).getLocation().hasLocationInfo(filepath, startline, re_start, _, _) and + re.getComponent(re.getNumberOfComponents() - 1) + .getLocation() + .hasLocationInfo(filepath, _, _, endline, re_end) + | + startcolumn = re_start + start and + endcolumn = re_start + end - 1 + ) + } + + /** Gets the file in which this term is found. */ + File getFile() { result = this.getLocation().getFile() } + + /** Gets the raw source text of this term. */ + string getRawValue() { result = this.toString() } + + /** Gets the string literal in which this term is found. */ + RegExpLiteral getLiteral() { result = TRegExpLiteral(re) } + + /** Gets the regular expression term that is matched (textually) before this one, if any. */ + RegExpTerm getPredecessor() { + exists(RegExpTerm parent | parent = this.getParent() | + result = parent.(RegExpSequence).previousElement(this) + or + not exists(parent.(RegExpSequence).previousElement(this)) and + not parent instanceof RegExpSubPattern and + result = parent.getPredecessor() + ) + } + + /** Gets the regular expression term that is matched (textually) after this one, if any. */ + RegExpTerm getSuccessor() { + exists(RegExpTerm parent | parent = this.getParent() | + result = parent.(RegExpSequence).nextElement(this) + or + not exists(parent.(RegExpSequence).nextElement(this)) and + not parent instanceof RegExpSubPattern and + result = parent.getSuccessor() + ) + } + + /** Gets the primary QL class for this term. */ + override string getAPrimaryQlClass() { result = "RegExpTerm" } +} + +/** + * A quantified regular expression term. 
+ * + * Example: + * + * ``` + * ((ECMA|Java)[sS]cript)* + * ``` + */ +class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier { + int part_end; + boolean may_repeat_forever; + + RegExpQuantifier() { + this = TRegExpQuantifier(re, start, end) and + re.qualifiedPart(start, part_end, end, _, may_repeat_forever) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegExp() = re and + result.getStart() = start and + result.getEnd() = part_end + } + + /** Holds if this term may match an unlimited number of times. */ + predicate mayRepeatForever() { may_repeat_forever = true } + + /** Gets the qualifier for this term. That is e.g. "?" for "a?". */ + string getQualifier() { result = re.getText().substring(part_end, end) } + + override string getAPrimaryQlClass() { result = "RegExpQuantifier" } +} + +/** + * A regular expression term that permits unlimited repetitions. + */ +class InfiniteRepetitionQuantifier extends RegExpQuantifier { + InfiniteRepetitionQuantifier() { this.mayRepeatForever() } + + override string getAPrimaryQlClass() { result = "InfiniteRepetitionQuantifier" } +} + +/** + * A star-quantified term. + * + * Example: + * + * ``` + * \w* + * ``` + */ +class RegExpStar extends InfiniteRepetitionQuantifier { + RegExpStar() { this.getQualifier().charAt(0) = "*" } + + override string getAPrimaryQlClass() { result = "RegExpStar" } +} + +/** + * A plus-quantified term. + * + * Example: + * + * ``` + * \w+ + * ``` + */ +class RegExpPlus extends InfiniteRepetitionQuantifier { + RegExpPlus() { this.getQualifier().charAt(0) = "+" } + + override string getAPrimaryQlClass() { result = "RegExpPlus" } +} + +/** + * An optional term. + * + * Example: + * + * ``` + * ;? + * ``` + */ +class RegExpOpt extends RegExpQuantifier { + RegExpOpt() { this.getQualifier().charAt(0) = "?"
} + + override string getAPrimaryQlClass() { result = "RegExpOpt" } +} + +/** + * A range-quantified term + * + * Examples: + * + * ``` + * \w{2,4} + * \w{2,} + * \w{2} + * ``` + */ +class RegExpRange extends RegExpQuantifier { + string upper; + string lower; + + RegExpRange() { re.multiples(part_end, end, lower, upper) } + + /** Gets the string defining the upper bound of this range, if any. */ + string getUpper() { result = upper } + + /** Gets the string defining the lower bound of this range, if any. */ + string getLower() { result = lower } + + /** + * Gets the upper bound of the range, if any. + * + * If there is no upper bound, any number of repetitions is allowed. + * For a term of the form `r{lo}`, both the lower and the upper bound + * are `lo`. + */ + int getUpperBound() { result = this.getUpper().toInt() } + + /** Gets the lower bound of the range. */ + int getLowerBound() { result = this.getLower().toInt() } + + override string getAPrimaryQlClass() { result = "RegExpRange" } +} + +/** + * A sequence term. + * + * Example: + * + * ``` + * (ECMA|Java)Script + * ``` + * + * This is a sequence with the elements `(ECMA|Java)` and `Script`. + */ +class RegExpSequence extends RegExpTerm, TRegExpSequence { + RegExpSequence() { + this = TRegExpSequence(re, start, end) and + exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead. + } + + override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) } + + /** Gets the element preceding `element` in this sequence. */ + RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) } + + /** Gets the element following `element` in this sequence. 
*/ + RegExpTerm nextElement(RegExpTerm element) { + exists(int i | + element = this.getChild(i) and + result = this.getChild(i + 1) + ) + } + + override string getAPrimaryQlClass() { result = "RegExpSequence" } +} + +pragma[nomagic] +private int seqChildEnd(RegExp re, int start, int end, int i) { + result = seqChild(re, start, end, i).getEnd() +} + +// moved out so we can use it in the charpred +private RegExpTerm seqChild(RegExp re, int start, int end, int i) { + re.sequence(start, end) and + ( + i = 0 and + result.getRegExp() = re and + result.getStart() = start and + exists(int itemEnd | + re.item(start, itemEnd) and + result.getEnd() = itemEnd + ) + or + i > 0 and + result.getRegExp() = re and + exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | + result.getStart() = itemStart and + re.item(itemStart, result.getEnd()) + ) + ) +} + +/** + * An alternative term, that is, a term of the form `a|b`. + * + * Example: + * + * ``` + * ECMA|Java + * ``` + */ +class RegExpAlt extends RegExpTerm, TRegExpAlt { + RegExpAlt() { this = TRegExpAlt(re, start, end) } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegExp() = re and + result.getStart() = start and + exists(int part_end | + re.alternationOption(start, end, start, part_end) and + result.getEnd() = part_end + ) + or + i > 0 and + result.getRegExp() = re and + exists(int part_start | + part_start = this.getChild(i - 1).getEnd() + 1 // allow for the | + | + result.getStart() = part_start and + re.alternationOption(start, end, part_start, result.getEnd()) + ) + } + + override string getAPrimaryQlClass() { result = "RegExpAlt" } +} + +class RegExpCharEscape = RegExpEscape; + +/** + * An escaped regular expression term, that is, a regular expression + * term starting with a backslash, which is not a backreference. + * + * Example: + * + * ``` + * \. 
+ * \w + * ``` + */ +class RegExpEscape extends RegExpNormalChar { + RegExpEscape() { re.escapedCharacter(start, end) } + + /** + * Gets the value of this escape; for example, `w` for `\w`, or a newline character for `\n`. + * TODO: Handle named escapes. + */ + override string getValue() { + this.isIdentityEscape() and result = this.getUnescaped() + or + this.getUnescaped() = "n" and result = "\n" + or + this.getUnescaped() = "r" and result = "\r" + or + this.getUnescaped() = "t" and result = "\t" + or + this.getUnescaped() = "f" and result = 12.toUnicode() + or + this.getUnescaped() = "v" and result = 11.toUnicode() + or + this.isUnicode() and + result = this.getUnicode() + } + + /** Holds if this term's name is given by the part following the escape character. */ + predicate isIdentityEscape() { + not this.getUnescaped() in ["n", "r", "t", "f", "v"] and not this.isUnicode() + } + + override string getAPrimaryQlClass() { result = "RegExpEscape" } + + /** Gets the part of the term following the escape character. That is e.g. "w" if the term is "\w". */ + string getUnescaped() { result = this.getText().suffix(1) } + + /** + * Gets the text for this escape. That is e.g. "\w". + */ + private string getText() { result = re.getText().substring(start, end) } + + /** + * Holds if this is a unicode escape. + */ + private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] } + + /** + * Gets the unicode char for this escape. + * E.g. for `\u0061` this returns "a". + */ + private string getUnicode() { + this.isUnicode() and + result = parseHexInt(this.getText().suffix(2)).toUnicode() + } +} + +/** + * A word boundary, that is, a regular expression term of the form `\b`. + */ +class RegExpWordBoundary extends RegExpSpecialChar { + RegExpWordBoundary() { this.getChar() = "\\b" } +} + +/** + * A character class escape in a regular expression. + * That is, an escaped character that denotes multiple characters.
+ * + * Examples: + * + * ``` + * \w + * \S + * ``` + */ +class RegExpCharacterClassEscape extends RegExpEscape { + RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W", "h", "H"] } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpCharacterClassEscape" } +} + +/** + * A character class in a regular expression. + * + * Examples: + * + * ```rb + * /[a-fA-F0-9]/ + * /[^abc]/ + * ``` + */ +class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { + RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } + + /** Holds if this character class is inverted, matching the opposite of its content. */ + predicate isInverted() { re.getChar(start + 1) = "^" } + + /** Holds if this character class can match anything. */ + predicate isUniversalClass() { + // [^] + this.isInverted() and not exists(this.getAChild()) + or + // [\w\W] and similar + not this.isInverted() and + exists(string cce1, string cce2 | + cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and + cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() + | + cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() + ) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegExp() = re and + exists(int itemStart, int itemEnd | + result.getStart() = itemStart and + re.charSetStart(start, itemStart) and + re.charSetChild(start, itemStart, itemEnd) and + result.getEnd() = itemEnd + ) + or + i > 0 and + result.getRegExp() = re and + exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() | + result.getStart() = itemStart and + re.charSetChild(start, itemStart, result.getEnd()) + ) + } + + override string getAPrimaryQlClass() { result = "RegExpCharacterClass" } +} + +/** + * A character range in a character class in a regular expression. 
+ * + * Example: + * + * ``` + * a-z + * ``` + */ +class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { + int lower_end; + int upper_start; + + RegExpCharacterRange() { + this = TRegExpCharacterRange(re, start, end) and + re.charRange(_, start, lower_end, upper_start, end) + } + + /** Holds if this range goes from `lo` to `hi`, in effect is `lo-hi`. */ + predicate isRange(string lo, string hi) { + lo = re.getText().substring(start, lower_end) and + hi = re.getText().substring(upper_start, end) + } + + override RegExpTerm getChild(int i) { + i = 0 and + result.getRegExp() = re and + result.getStart() = start and + result.getEnd() = lower_end + or + i = 1 and + result.getRegExp() = re and + result.getStart() = upper_start and + result.getEnd() = end + } + + override string getAPrimaryQlClass() { result = "RegExpCharacterRange" } +} + +/** + * A normal character in a regular expression, that is, a character + * without special meaning. This includes escaped characters. + * + * Examples: + * ``` + * t + * \t + * ``` + */ +class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { + RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string representation of the char matched by this term. */ + string getValue() { result = re.getText().substring(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpNormalChar" } +} + +/** + * A constant regular expression term, that is, a regular expression + * term matching a single string. Currently, this will always be a single character. 
+ * + * Example: + * + * ``` + * a + * ``` + */ +class RegExpConstant extends RegExpTerm { + string value; + + RegExpConstant() { + this = TRegExpNormalChar(re, start, end) and + not this instanceof RegExpCharacterClassEscape and + // exclude chars in qualifiers + // TODO: push this into regex library + not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) | + qstart <= start and end <= qend + ) and + value = this.(RegExpNormalChar).getValue() + or + this = TRegExpSpecialChar(re, start, end) and + re.inCharSet(start) and + value = this.(RegExpSpecialChar).getChar() + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the string matched by this constant term. */ + string getValue() { result = value } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpConstant" } +} + +/** + * A grouped regular expression. + * + * Examples: + * + * ``` + * (ECMA|Java) + * (?:ECMA|Java) + * (?<quote>['"]) + * ``` + */ +class RegExpGroup extends RegExpTerm, TRegExpGroup { + RegExpGroup() { this = TRegExpGroup(re, start, end) } + + /** + * Gets the index of this capture group within the enclosing regular + * expression literal. + * + * For example, in the regular expression `/((a?).)(?:b)/`, the + * group `((a?).)` has index 1, the group `(a?)` nested inside it + * has index 2, and the group `(?:b)` has no index, since it is + * not a capture group. + */ + int getNumber() { result = re.getGroupNumber(start, end) } + + /** Holds if this is a capture group. */ + predicate isCapture() { exists(this.getNumber()) } + + /** Holds if this is a named capture group. */ + predicate isNamed() { exists(this.getName()) } + + /** Gets the name of this capture group, if any.
*/ + string getName() { result = re.getGroupName(start, end) } + + override RegExpTerm getChild(int i) { + result.getRegExp() = re and + i = 0 and + re.groupContents(start, end, result.getStart(), result.getEnd()) + } + + override string getAPrimaryQlClass() { result = "RegExpGroup" } +} + +/** + * A special character in a regular expression. + * + * Examples: + * ``` + * ^ + * $ + * . + * ``` + */ +class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { + string char; + + RegExpSpecialChar() { + this = TRegExpSpecialChar(re, start, end) and + re.specialCharacter(start, end, char) + } + + /** + * Holds if this constant represents a valid Unicode character (as opposed + * to a surrogate code point that does not correspond to a character by itself.) + */ + predicate isCharacter() { any() } + + /** Gets the char for this term. */ + string getChar() { result = char } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpSpecialChar" } +} + +/** + * A dot regular expression. + * + * Example: + * + * ``` + * . + * ``` + */ +class RegExpDot extends RegExpSpecialChar { + RegExpDot() { this.getChar() = "." } + + override string getAPrimaryQlClass() { result = "RegExpDot" } +} + +/** + * A dollar assertion `$` or `\Z` matching the end of a line. + * + * Example: + * + * ``` + * $ + * ``` + */ +class RegExpDollar extends RegExpSpecialChar { + RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] } + + override string getAPrimaryQlClass() { result = "RegExpDollar" } +} + +/** + * A caret assertion `^` or `\A` matching the beginning of a line. + * + * Example: + * + * ``` + * ^ + * ``` + */ +class RegExpCaret extends RegExpSpecialChar { + RegExpCaret() { this.getChar() = ["^", "\\A"] } + + override string getAPrimaryQlClass() { result = "RegExpCaret" } +} + +/** + * A zero-width match, that is, either an empty group or an assertion. 
+ * + * Examples: + * ``` + * () + * (?=\w) + * ``` + */ +class RegExpZeroWidthMatch extends RegExpGroup { + RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpZeroWidthMatch" } +} + +/** + * A zero-width lookahead or lookbehind assertion. + * + * Examples: + * + * ``` + * (?=\w) + * (?!\n) + * (?<=\.) + * (?` + * in a regular expression. + * + * Examples: + * + * ``` + * \1 + * (?P=quote) + * ``` + */ +class RegExpBackRef extends RegExpTerm, TRegExpBackRef { + RegExpBackRef() { this = TRegExpBackRef(re, start, end) } + + /** + * Gets the number of the capture group this back reference refers to, if any. + */ + int getNumber() { result = re.getBackRefNumber(start, end) } + + /** + * Gets the name of the capture group this back reference refers to, if any. + */ + string getName() { result = re.getBackRefName(start, end) } + + /** Gets the capture group this back reference refers to. */ + RegExpGroup getGroup() { + result.getLiteral() = this.getLiteral() and + ( + result.getNumber() = this.getNumber() or + result.getName() = this.getName() + ) + } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpBackRef" } +} + +/** + * A named character property. For example, the POSIX bracket expression + * `[[:digit:]]`. + */ +class RegExpNamedCharacterProperty extends RegExpTerm, TRegExpNamedCharacterProperty { + RegExpNamedCharacterProperty() { this = TRegExpNamedCharacterProperty(re, start, end) } + + override RegExpTerm getChild(int i) { none() } + + override string getAPrimaryQlClass() { result = "RegExpNamedCharacterProperty" } + + /** + * Gets the property name. For example, in `\p{Space}`, the result is + * `"Space"`. + */ + string getName() { result = re.getCharacterPropertyName(start, end) } + + /** + * Holds if the property is inverted. 
For example, it holds for `\p{^Digit}`, + * which matches non-digits. + */ + predicate isInverted() { re.namedCharacterPropertyIsInverted(start, end) } +} + +/** Gets the parse tree resulting from parsing `re`, if such has been constructed. */ +RegExpTerm getParsedRegExp(AST::RegExpLiteral re) { + result.getRegExp() = re and result.isRootTerm() +} diff --git a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll b/ruby/ql/lib/codeql/ruby/regexp/internal/ParseRegExp.qll similarity index 90% rename from ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll rename to ruby/ql/lib/codeql/ruby/regexp/internal/ParseRegExp.qll index ac5c213b855..b83ad0685ff 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/ParseRegExp.qll +++ b/ruby/ql/lib/codeql/ruby/regexp/internal/ParseRegExp.qll @@ -7,10 +7,6 @@ private import codeql.ruby.ast.Literal as AST private import codeql.Locations -private import codeql.ruby.DataFlow -private import codeql.ruby.controlflow.CfgNodes -private import codeql.ruby.ApiGraphs -private import codeql.ruby.dataflow.internal.tainttrackingforlibraries.TaintTrackingImpl /** * A `StringlikeLiteral` containing a regular expression term, that is, either @@ -116,6 +112,7 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** Holds if a character set starts between `start` and `end`. */ predicate charSetStart(int start, int end) { this.charSetStart(start) = true and ( @@ -145,14 +142,21 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } - predicate charSetToken(int charsetStart, int index, int tokenStart, int tokenEnd) { + /** + * Holds if the `index`th token (a character or a `-`) of the character set starting at + * `charsetStart` is found between `tokenStart` and `tokenEnd`.
+ */ + private predicate charSetToken(int charsetStart, int index, int tokenStart, int tokenEnd) { tokenStart = rank[index](int start, int end | this.charSetToken(charsetStart, start, end) | start) and this.charSetToken(charsetStart, tokenStart, tokenEnd) } - /** Either a char or a - */ - predicate charSetToken(int charsetStart, int start, int end) { + /** + * Holds if the character set starting at `charsetStart` contains either + * a character or a `-` found between `start` and `end`. + */ + private predicate charSetToken(int charsetStart, int start, int end) { this.charSetStart(charsetStart, start) and ( this.escapedCharacter(start, end) @@ -174,6 +178,10 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** + * Holds if the character set starting at `charsetStart` contains either + * a character or a range found between `start` and `end`. + */ predicate charSetChild(int charsetStart, int start, int end) { this.charSetToken(charsetStart, start, end) and not exists(int rangeStart, int rangeEnd | @@ -185,6 +193,11 @@ abstract class RegExp extends AST::StringlikeLiteral { this.charRange(charsetStart, start, _, _, end) } + /** + * Holds if the character set starting at `charsetStart` contains a character range + * with lower bound found between `start` and `lowerEnd` + * and upper bound found between `upperStart` and `end`. + */ predicate charRange(int charsetStart, int start, int lowerEnd, int upperStart, int end) { exists(int index | this.charRangeEnd(charsetStart, index) = true and @@ -193,6 +206,13 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** + * Helper predicate for `charRange`. + * We can determine where character ranges end by a left to right sweep. + * + * To avoid negative recursion we return a boolean. See `escaping`, + * the helper for `escapingChar`, for a clean use of this pattern.
+ */ private boolean charRangeEnd(int charsetStart, int index) { this.charSetToken(charsetStart, index, _, _) and ( @@ -216,8 +236,15 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** Holds if the character at `pos` is a "\" that is actually escaping what comes after. */ predicate escapingChar(int pos) { this.escaping(pos) = true } + /** + * Helper predicate for `escapingChar`. + * In order to avoid negative recursion, we return a boolean. + * This way, we can refer to `escaping(pos - 1).booleanNot()` + * rather than to a negated version of `escaping(pos)`. + */ private boolean escaping(int pos) { pos = -1 and result = false or @@ -229,8 +256,10 @@ abstract class RegExp extends AST::StringlikeLiteral { /** Gets the text of this regex */ string getText() { result = this.getConstantValue().getString() } + /** Gets the `i`th character of this regex */ string getChar(int i) { result = this.getText().charAt(i) } + /** Gets the `i`th character of this regex, unless it is part of a character escape sequence. */ string nonEscapedCharAt(int i) { result = this.getText().charAt(i) and not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) @@ -242,6 +271,9 @@ abstract class RegExp extends AST::StringlikeLiteral { private predicate isGroupStart(int i) { this.nonEscapedCharAt(i) = "(" and not this.inCharSet(i) } + /** + * Holds if the `i`th character could not be parsed. + */ predicate failedToParse(int i) { exists(this.getChar(i)) and not exists(int start, int end | @@ -331,6 +363,11 @@ abstract class RegExp extends AST::StringlikeLiteral { this.getChar(start + 3) = "^" } + /** + * Holds if an escaped character is found between `start` and `end`. + * Escaped characters include hex values, octal values and named escapes, + * but excludes backreferences.
+ */ predicate escapedCharacter(int start, int end) { this.escapingChar(start) and not this.numberedBackreference(start, _, _) and @@ -350,17 +387,25 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** + * Holds if the character at `index` is inside a character set. + */ predicate inCharSet(int index) { exists(int x, int y | this.charSet(x, y) and index in [x + 1 .. y - 2]) } + /** + * Holds if the character at `index` is inside a posix bracket. + */ predicate inPosixBracket(int index) { exists(int x, int y | this.posixStyleNamedCharacterProperty(x, y, _) and index in [x + 1 .. y - 2] ) } - /** 'Simple' characters are any that don't alter the parsing of the regex. */ + /** + * 'simple' characters are any that don't alter the parsing of the regex. + */ private predicate simpleCharacter(int start, int end) { end = start + 1 and not this.charSet(start, _) and @@ -391,6 +436,9 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** + * Holds if a simple or escaped character is found between `start` and `end`. + */ predicate character(int start, int end) { ( this.simpleCharacter(start, end) and @@ -406,12 +454,18 @@ abstract class RegExp extends AST::StringlikeLiteral { not exists(int x, int y | this.multiples(x, y, _, _) and x <= start and y >= end) } + /** + * Holds if a normal character is found between `start` and `end`. + */ predicate normalCharacter(int start, int end) { end = start + 1 and this.character(start, end) and not this.specialCharacter(start, end, _) } + /** + * Holds if a special character is found between `start` and `end`. + */ predicate specialCharacter(int start, int end, string char) { this.character(start, end) and not this.inCharSet(start) and @@ -505,6 +559,7 @@ abstract class RegExp extends AST::StringlikeLiteral { this.positiveLookbehindAssertionGroup(start, end) } + /** Holds if an empty group is found between `start` and `end`. 
*/ predicate emptyGroup(int start, int end) { exists(int endm1 | end = endm1 + 1 | this.groupStart(start, endm1) and @@ -538,24 +593,28 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** Holds if a negative lookahead is found between `start` and `end` */ predicate negativeLookaheadAssertionGroup(int start, int end) { exists(int inStart | this.negativeLookaheadAssertionStart(start, inStart) | this.groupContents(start, end, inStart, _) ) } + /** Holds if a negative lookbehind is found between `start` and `end` */ predicate negativeLookbehindAssertionGroup(int start, int end) { exists(int inStart | this.negativeLookbehindAssertionStart(start, inStart) | this.groupContents(start, end, inStart, _) ) } + /** Holds if a positive lookahead is found between `start` and `end` */ predicate positiveLookaheadAssertionGroup(int start, int end) { exists(int inStart | this.lookaheadAssertionStart(start, inStart) | this.groupContents(start, end, inStart, _) ) } + /** Holds if a positive lookbehind is found between `start` and `end` */ predicate positiveLookbehindAssertionGroup(int start, int end) { exists(int inStart | this.lookbehindAssertionStart(start, inStart) | this.groupContents(start, end, inStart, _) @@ -661,6 +720,7 @@ abstract class RegExp extends AST::StringlikeLiteral { end = start + 3 } + /** Matches the contents of a group. */ predicate groupContents(int start, int end, int inStart, int inEnd) { this.groupStart(start, inStart) and end = inEnd + 1 and @@ -747,6 +807,11 @@ abstract class RegExp extends AST::StringlikeLiteral { ) } + /** + * Holds if a repetition quantifier is found between `start` and `end`, + * with the given lower and upper bounds. If a bound is omitted, the corresponding + * string is empty. 
+ */ predicate multiples(int start, int end, string lower, string upper) { exists(string text, string match, string inner | text = this.getText() and @@ -774,6 +839,13 @@ abstract class RegExp extends AST::StringlikeLiteral { this.qualifiedPart(start, _, end, maybeEmpty, mayRepeatForever) } + /** + * Holds if a qualified part is found between `start` and `partEnd` and the qualifier is + * found between `partEnd` and `end`. + * + * `maybeEmpty` is true if the part is optional. + * `mayRepeatForever` is true if the part may be repeated unboundedly. + */ predicate qualifiedPart( int start, int partEnd, int end, boolean maybeEmpty, boolean mayRepeatForever ) { @@ -781,6 +853,7 @@ abstract class RegExp extends AST::StringlikeLiteral { this.qualifier(partEnd, end, maybeEmpty, mayRepeatForever) } + /** Holds if the range `start`, `end` contains a character, a quantifier, a character set or a group. */ predicate item(int start, int end) { this.qualifiedItem(start, end, _, _) or @@ -960,75 +1033,3 @@ abstract class RegExp extends AST::StringlikeLiteral { this.lastPart(start, end) } } - -private class RegExpLiteralRegExp extends RegExp, AST::RegExpLiteral { - override predicate isDotAll() { this.hasMultilineFlag() } - - override predicate isIgnoreCase() { this.hasCaseInsensitiveFlag() } - - override string getFlags() { result = this.getFlagString() } -} - -private class ParsedStringRegExp extends RegExp { - private DataFlow::Node parse; - - ParsedStringRegExp() { this = regExpSource(parse).asExpr().getExpr() } - - DataFlow::Node getAParse() { result = parse } - - override predicate isDotAll() { none() } - - override predicate isIgnoreCase() { none() } - - override string getFlags() { none() } -} - -/** - * Holds if `source` may be interpreted as a regular expression. - */ -private predicate isInterpretedAsRegExp(DataFlow::Node source) { - // The first argument to an invocation of `Regexp.new` or `Regexp.compile`.
- source = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]).getArgument(0) - or - // The argument of a call that coerces the argument to a regular expression. - exists(DataFlow::CallNode mce | - mce.getMethodName() = ["match", "match?"] and - source = mce.getArgument(0) and - // exclude https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match - not mce.getReceiver().asExpr().getExpr() instanceof AST::RegExpLiteral - ) -} - -private class RegExpConfiguration extends Configuration { - RegExpConfiguration() { this = "RegExpConfiguration" } - - override predicate isSource(DataFlow::Node source) { - source.asExpr() = - any(ExprCfgNode e | - e.getConstantValue().isString(_) and - not e instanceof ExprNodes::VariableReadAccessCfgNode and - not e instanceof ExprNodes::ConstantReadAccessCfgNode - ) - } - - override predicate isSink(DataFlow::Node sink) { isInterpretedAsRegExp(sink) } - - override predicate isSanitizer(DataFlow::Node node) { - // stop flow if `node` is receiver of - // https://ruby-doc.org/core-2.4.0/String.html#method-i-match - exists(DataFlow::CallNode mce | - mce.getMethodName() = ["match", "match?"] and - node = mce.getReceiver() and - mce.getArgument(0).asExpr().getExpr() instanceof AST::RegExpLiteral - ) - } -} - -/** - * Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted - * as a part of a regular expression. 
- */ -cached -DataFlow::Node regExpSource(DataFlow::Node re) { - exists(RegExpConfiguration c | c.hasFlow(result, re)) -} diff --git a/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSCustomizations.qll b/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSCustomizations.qll index d32734eb02b..5337335af5b 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSCustomizations.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/PolynomialReDoSCustomizations.qll @@ -8,8 +8,7 @@ private import codeql.ruby.AST as AST private import codeql.ruby.CFG private import codeql.ruby.DataFlow private import codeql.ruby.dataflow.RemoteFlowSources -private import codeql.ruby.security.performance.ParseRegExp as RegExp -private import codeql.ruby.security.performance.RegExpTreeView +private import codeql.ruby.Regexp private import codeql.ruby.security.performance.SuperlinearBackTracking module PolynomialReDoS { diff --git a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll index 55550b386cc..2efca45ff11 100644 --- a/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll +++ b/ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll @@ -1,8 +1,10 @@ -private import codeql.ruby.ast.Literal as AST -private import ParseRegExp -private import codeql.NumberUtils +/** + * This module should provide a class hierarchy corresponding to a parse tree of regular expressions. + */ + +import codeql.ruby.Regexp import codeql.Locations -private import codeql.ruby.DataFlow +private import codeql.ruby.ast.Literal as AST /** * Holds if `term` is an ecape class representing e.g. `\d`. @@ -59,776 +61,3 @@ module RegExpFlags { root.getLiteral().isDotAll() } } - -/** - * Provides utility predicates related to regular expressions. - */ -module RegExpPatterns { - /** - * Gets a pattern that matches common top-level domain names in lower case. 
- */ - string getACommonTld() { - // according to ranking by http://google.com/search?q=site:.<> - result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])" - } -} - -/** - * An element containing a regular expression term, that is, either - * a string literal (parsed as a regular expression) - * or another regular expression term. - */ -class RegExpParent extends TRegExpParent { - string toString() { result = "RegExpParent" } - - RegExpTerm getChild(int i) { none() } - - final RegExpTerm getAChild() { result = this.getChild(_) } - - int getNumChild() { result = count(this.getAChild()) } - - /** - * Gets the name of a primary CodeQL class to which this regular - * expression term belongs. - */ - string getAPrimaryQlClass() { result = "RegExpParent" } - - /** - * Gets a comma-separated list of the names of the primary CodeQL classes to - * which this regular expression term belongs. - */ - final string getPrimaryQlClasses() { result = concat(this.getAPrimaryQlClass(), ",") } -} - -class RegExpLiteral extends TRegExpLiteral, RegExpParent { - RegExp re; - - RegExpLiteral() { this = TRegExpLiteral(re) } - - override RegExpTerm getChild(int i) { i = 0 and result.getRegExp() = re and result.isRootTerm() } - - predicate isDotAll() { re.isDotAll() } - - predicate isIgnoreCase() { re.isIgnoreCase() } - - string getFlags() { result = re.getFlags() } - - override string getAPrimaryQlClass() { result = "RegExpLiteral" } -} - -class RegExpTerm extends RegExpParent { - RegExp re; - int start; - int end; - - RegExpTerm() { - this = TRegExpAlt(re, start, end) - or - this = TRegExpBackRef(re, start, end) - or - this = TRegExpCharacterClass(re, start, end) - or - this = TRegExpCharacterRange(re, start, end) - or - this = TRegExpNormalChar(re, start, end) - or - this = TRegExpGroup(re, start, end) - or - this = TRegExpQuantifier(re, start, end) - or - this = TRegExpSequence(re, start, end) and - exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it 
should be treated as that element instead. - or - this = TRegExpSpecialChar(re, start, end) - or - this = TRegExpNamedCharacterProperty(re, start, end) - } - - RegExpTerm getRootTerm() { - this.isRootTerm() and result = this - or - result = this.getParent().(RegExpTerm).getRootTerm() - } - - predicate isUsedAsRegExp() { any() } - - predicate isRootTerm() { start = 0 and end = re.getText().length() } - - override RegExpTerm getChild(int i) { - result = this.(RegExpAlt).getChild(i) - or - result = this.(RegExpBackRef).getChild(i) - or - result = this.(RegExpCharacterClass).getChild(i) - or - result = this.(RegExpCharacterRange).getChild(i) - or - result = this.(RegExpNormalChar).getChild(i) - or - result = this.(RegExpGroup).getChild(i) - or - result = this.(RegExpQuantifier).getChild(i) - or - result = this.(RegExpSequence).getChild(i) - or - result = this.(RegExpSpecialChar).getChild(i) - or - result = this.(RegExpNamedCharacterProperty).getChild(i) - } - - RegExpParent getParent() { result.getAChild() = this } - - RegExp getRegExp() { result = re } - - int getStart() { result = start } - - int getEnd() { result = end } - - override string toString() { result = re.getText().substring(start, end) } - - override string getAPrimaryQlClass() { result = "RegExpTerm" } - - Location getLocation() { result = re.getLocation() } - - pragma[noinline] - private predicate componentHasLocationInfo( - int i, string filepath, int startline, int startcolumn, int endline, int endcolumn - ) { - re.getComponent(i) - .getLocation() - .hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn) - } - - predicate hasLocationInfo( - string filepath, int startline, int startcolumn, int endline, int endcolumn - ) { - exists(int re_start, int re_end | - this.componentHasLocationInfo(0, filepath, startline, re_start, _, _) and - this.componentHasLocationInfo(re.getNumberOfComponents() - 1, filepath, _, _, endline, re_end) and - startcolumn = re_start + start and - endcolumn = 
re_start + end - 1 - ) - } - - File getFile() { result = this.getLocation().getFile() } - - string getRawValue() { result = this.toString() } - - RegExpLiteral getLiteral() { result = TRegExpLiteral(re) } - - /** Gets the regular expression term that is matched (textually) before this one, if any. */ - RegExpTerm getPredecessor() { - exists(RegExpTerm parent | parent = this.getParent() | - result = parent.(RegExpSequence).previousElement(this) - or - not exists(parent.(RegExpSequence).previousElement(this)) and - not parent instanceof RegExpSubPattern and - result = parent.getPredecessor() - ) - } - - /** Gets the regular expression term that is matched (textually) after this one, if any. */ - RegExpTerm getSuccessor() { - exists(RegExpTerm parent | parent = this.getParent() | - result = parent.(RegExpSequence).nextElement(this) - or - not exists(parent.(RegExpSequence).nextElement(this)) and - not parent instanceof RegExpSubPattern and - result = parent.getSuccessor() - ) - } -} - -newtype TRegExpParent = - TRegExpLiteral(RegExp re) or - TRegExpQuantifier(RegExp re, int start, int end) { re.qualifiedItem(start, end, _, _) } or - TRegExpSequence(RegExp re, int start, int end) { re.sequence(start, end) } or - TRegExpAlt(RegExp re, int start, int end) { re.alternation(start, end) } or - TRegExpCharacterClass(RegExp re, int start, int end) { re.charSet(start, end) } or - TRegExpCharacterRange(RegExp re, int start, int end) { re.charRange(_, start, _, _, end) } or - TRegExpGroup(RegExp re, int start, int end) { re.group(start, end) } or - TRegExpSpecialChar(RegExp re, int start, int end) { re.specialCharacter(start, end, _) } or - TRegExpNormalChar(RegExp re, int start, int end) { - re.normalCharacterSequence(start, end) - or - re.escapedCharacter(start, end) and - not re.specialCharacter(start, end, _) - } or - TRegExpBackRef(RegExp re, int start, int end) { re.backreference(start, end) } or - TRegExpNamedCharacterProperty(RegExp re, int start, int end) { - 
re.namedCharacterProperty(start, end, _) - } - -class RegExpQuantifier extends RegExpTerm, TRegExpQuantifier { - int part_end; - boolean may_repeat_forever; - - RegExpQuantifier() { - this = TRegExpQuantifier(re, start, end) and - re.qualifiedPart(start, part_end, end, _, may_repeat_forever) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegExp() = re and - result.getStart() = start and - result.getEnd() = part_end - } - - predicate mayRepeatForever() { may_repeat_forever = true } - - string getQualifier() { result = re.getText().substring(part_end, end) } - - override string getAPrimaryQlClass() { result = "RegExpQuantifier" } -} - -class InfiniteRepetitionQuantifier extends RegExpQuantifier { - InfiniteRepetitionQuantifier() { this.mayRepeatForever() } - - override string getAPrimaryQlClass() { result = "InfiniteRepetitionQuantifier" } -} - -class RegExpStar extends InfiniteRepetitionQuantifier { - RegExpStar() { this.getQualifier().charAt(0) = "*" } - - override string getAPrimaryQlClass() { result = "RegExpStar" } -} - -class RegExpPlus extends InfiniteRepetitionQuantifier { - RegExpPlus() { this.getQualifier().charAt(0) = "+" } - - override string getAPrimaryQlClass() { result = "RegExpPlus" } -} - -class RegExpOpt extends RegExpQuantifier { - RegExpOpt() { this.getQualifier().charAt(0) = "?" } - - override string getAPrimaryQlClass() { result = "RegExpOpt" } -} - -class RegExpRange extends RegExpQuantifier { - string upper; - string lower; - - RegExpRange() { re.multiples(part_end, end, lower, upper) } - - string getUpper() { result = upper } - - string getLower() { result = lower } - - /** - * Gets the upper bound of the range, if any. - * - * If there is no upper bound, any number of repetitions is allowed. - * For a term of the form `r{lo}`, both the lower and the upper bound - * are `lo`. - */ - int getUpperBound() { result = this.getUpper().toInt() } - - /** Gets the lower bound of the range. 
*/ - int getLowerBound() { result = this.getLower().toInt() } - - override string getAPrimaryQlClass() { result = "RegExpRange" } -} - -class RegExpSequence extends RegExpTerm, TRegExpSequence { - RegExpSequence() { - this = TRegExpSequence(re, start, end) and - exists(seqChild(re, start, end, 1)) // if a sequence does not have more than one element, it should be treated as that element instead. - } - - override RegExpTerm getChild(int i) { result = seqChild(re, start, end, i) } - - /** Gets the element preceding `element` in this sequence. */ - RegExpTerm previousElement(RegExpTerm element) { element = this.nextElement(result) } - - /** Gets the element following `element` in this sequence. */ - RegExpTerm nextElement(RegExpTerm element) { - exists(int i | - element = this.getChild(i) and - result = this.getChild(i + 1) - ) - } - - override string getAPrimaryQlClass() { result = "RegExpSequence" } -} - -pragma[nomagic] -private int seqChildEnd(RegExp re, int start, int end, int i) { - result = seqChild(re, start, end, i).getEnd() -} - -// moved out so we can use it in the charpred -private RegExpTerm seqChild(RegExp re, int start, int end, int i) { - re.sequence(start, end) and - ( - i = 0 and - result.getRegExp() = re and - result.getStart() = start and - exists(int itemEnd | - re.item(start, itemEnd) and - result.getEnd() = itemEnd - ) - or - i > 0 and - result.getRegExp() = re and - exists(int itemStart | itemStart = seqChildEnd(re, start, end, i - 1) | - result.getStart() = itemStart and - re.item(itemStart, result.getEnd()) - ) - ) -} - -class RegExpAlt extends RegExpTerm, TRegExpAlt { - RegExpAlt() { this = TRegExpAlt(re, start, end) } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegExp() = re and - result.getStart() = start and - exists(int part_end | - re.alternationOption(start, end, start, part_end) and - result.getEnd() = part_end - ) - or - i > 0 and - result.getRegExp() = re and - exists(int part_start | - part_start = 
this.getChild(i - 1).getEnd() + 1 // allow for the | - | - result.getStart() = part_start and - re.alternationOption(start, end, part_start, result.getEnd()) - ) - } - - override string getAPrimaryQlClass() { result = "RegExpAlt" } -} - -class RegExpCharEscape = RegExpEscape; - -class RegExpEscape extends RegExpNormalChar { - RegExpEscape() { re.escapedCharacter(start, end) } - - /** - * Gets the name of the escaped; for example, `w` for `\w`. - * TODO: Handle named escapes. - */ - override string getValue() { - this.isIdentityEscape() and result = this.getUnescaped() - or - this.getUnescaped() = "n" and result = "\n" - or - this.getUnescaped() = "r" and result = "\r" - or - this.getUnescaped() = "t" and result = "\t" - or - this.isUnicode() and - result = this.getUnicode() - } - - predicate isIdentityEscape() { - not this.getUnescaped() in ["n", "r", "t"] and not this.isUnicode() - } - - /** - * Gets the text for this escape. That is e.g. "\w". - */ - private string getText() { result = re.getText().substring(start, end) } - - /** - * Holds if this is a unicode escape. - */ - private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] } - - /** - * Gets the unicode char for this escape. - * E.g. for `\u0061` this returns "a". - */ - private string getUnicode() { - this.isUnicode() and - result = parseHexInt(this.getText().suffix(2)).toUnicode() - } - - string getUnescaped() { result = this.getText().suffix(1) } - - override string getAPrimaryQlClass() { result = "RegExpEscape" } -} - -/** - * A word boundary, that is, a regular expression term of the form `\b`. - */ -class RegExpWordBoundary extends RegExpSpecialChar { - RegExpWordBoundary() { this.getChar() = "\\b" } -} - -/** - * A character class escape in a regular expression. - * That is, an escaped character that denotes multiple characters. 
- * - * Examples: - * - * ``` - * \w - * \S - * ``` - */ -class RegExpCharacterClassEscape extends RegExpEscape { - RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W", "h", "H"] } - - /** Gets the name of the character class; for example, `w` for `\w`. */ - // override string getValue() { result = value } - override RegExpTerm getChild(int i) { none() } - - override string getAPrimaryQlClass() { result = "RegExpCharacterClassEscape" } -} - -/** - * A character class. - * - * Examples: - * - * ```rb - * /[a-fA-F0-9]/ - * /[^abc]/ - * ``` - */ -class RegExpCharacterClass extends RegExpTerm, TRegExpCharacterClass { - RegExpCharacterClass() { this = TRegExpCharacterClass(re, start, end) } - - predicate isInverted() { re.getChar(start + 1) = "^" } - - predicate isUniversalClass() { - // [^] - this.isInverted() and not exists(this.getAChild()) - or - // [\w\W] and similar - not this.isInverted() and - exists(string cce1, string cce2 | - cce1 = this.getAChild().(RegExpCharacterClassEscape).getValue() and - cce2 = this.getAChild().(RegExpCharacterClassEscape).getValue() - | - cce1 != cce2 and cce1.toLowerCase() = cce2.toLowerCase() - ) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegExp() = re and - exists(int itemStart, int itemEnd | - result.getStart() = itemStart and - re.charSetStart(start, itemStart) and - re.charSetChild(start, itemStart, itemEnd) and - result.getEnd() = itemEnd - ) - or - i > 0 and - result.getRegExp() = re and - exists(int itemStart | itemStart = this.getChild(i - 1).getEnd() | - result.getStart() = itemStart and - re.charSetChild(start, itemStart, result.getEnd()) - ) - } - - override string getAPrimaryQlClass() { result = "RegExpCharacterClass" } -} - -class RegExpCharacterRange extends RegExpTerm, TRegExpCharacterRange { - int lower_end; - int upper_start; - - RegExpCharacterRange() { - this = TRegExpCharacterRange(re, start, end) and - re.charRange(_, start, lower_end, upper_start, end) - } 
- - predicate isRange(string lo, string hi) { - lo = re.getText().substring(start, lower_end) and - hi = re.getText().substring(upper_start, end) - } - - override RegExpTerm getChild(int i) { - i = 0 and - result.getRegExp() = re and - result.getStart() = start and - result.getEnd() = lower_end - or - i = 1 and - result.getRegExp() = re and - result.getStart() = upper_start and - result.getEnd() = end - } - - override string getAPrimaryQlClass() { result = "RegExpCharacterRange" } -} - -class RegExpNormalChar extends RegExpTerm, TRegExpNormalChar { - RegExpNormalChar() { this = TRegExpNormalChar(re, start, end) } - - predicate isCharacter() { any() } - - string getValue() { result = re.getText().substring(start, end) } - - override RegExpTerm getChild(int i) { none() } - - override string getAPrimaryQlClass() { result = "RegExpNormalChar" } -} - -class RegExpConstant extends RegExpTerm { - string value; - - RegExpConstant() { - this = TRegExpNormalChar(re, start, end) and - not this instanceof RegExpCharacterClassEscape and - // exclude chars in qualifiers - // TODO: push this into regex library - not exists(int qstart, int qend | re.qualifiedPart(_, qstart, qend, _, _) | - qstart <= start and end <= qend - ) and - value = this.(RegExpNormalChar).getValue() - or - this = TRegExpSpecialChar(re, start, end) and - re.inCharSet(start) and - value = this.(RegExpSpecialChar).getChar() - } - - predicate isCharacter() { any() } - - string getValue() { result = value } - - override RegExpTerm getChild(int i) { none() } - - override string getAPrimaryQlClass() { result = "RegExpConstant" } -} - -class RegExpGroup extends RegExpTerm, TRegExpGroup { - RegExpGroup() { this = TRegExpGroup(re, start, end) } - - /** - * Gets the index of this capture group within the enclosing regular - * expression literal. 
- * - * For example, in the regular expression `/((a?).)(?:b)/`, the - * group `((a?).)` has index 1, the group `(a?)` nested inside it - * has index 2, and the group `(?:b)` has no index, since it is - * not a capture group. - */ - int getNumber() { result = re.getGroupNumber(start, end) } - - /** Holds if this is a capture group. */ - predicate isCapture() { exists(this.getNumber()) } - - /** Holds if this is a named capture group. */ - predicate isNamed() { exists(this.getName()) } - - /** Gets the name of this capture group, if any. */ - string getName() { result = re.getGroupName(start, end) } - - predicate isCharacter() { any() } - - string getValue() { result = re.getText().substring(start, end) } - - override RegExpTerm getChild(int i) { - result.getRegExp() = re and - i = 0 and - re.groupContents(start, end, result.getStart(), result.getEnd()) - } - - override string getAPrimaryQlClass() { result = "RegExpGroup" } -} - -class RegExpSpecialChar extends RegExpTerm, TRegExpSpecialChar { - string char; - - RegExpSpecialChar() { - this = TRegExpSpecialChar(re, start, end) and - re.specialCharacter(start, end, char) - } - - predicate isCharacter() { any() } - - string getChar() { result = char } - - override RegExpTerm getChild(int i) { none() } - - override string getAPrimaryQlClass() { result = "RegExpSpecialChar" } -} - -class RegExpDot extends RegExpSpecialChar { - RegExpDot() { this.getChar() = "." 
} - - override string getAPrimaryQlClass() { result = "RegExpDot" } -} - -class RegExpDollar extends RegExpSpecialChar { - RegExpDollar() { this.getChar() = ["$", "\\Z", "\\z"] } - - override string getAPrimaryQlClass() { result = "RegExpDollar" } -} - -class RegExpCaret extends RegExpSpecialChar { - RegExpCaret() { this.getChar() = ["^", "\\A"] } - - override string getAPrimaryQlClass() { result = "RegExpCaret" } -} - -class RegExpZeroWidthMatch extends RegExpGroup { - RegExpZeroWidthMatch() { re.zeroWidthMatch(start, end) } - - override predicate isCharacter() { any() } - - override RegExpTerm getChild(int i) { none() } - - override string getAPrimaryQlClass() { result = "RegExpZeroWidthMatch" } -} - -/** - * A zero-width lookahead or lookbehind assertion. - * - * Examples: - * - * ``` - * (?=\w) - * (?!\n) - * (?<=\.) - * (?