Merge pull request #8293 from aibaars/regex-pattern-source

Ruby: parse more string literals as regular expressions
This commit is contained in:
Arthur Baars
2022-03-03 17:35:40 +01:00
committed by GitHub
6 changed files with 141 additions and 7 deletions

View File

@@ -0,0 +1,4 @@
---
category: minorAnalysis
---
* The `RegExp` class is now an abstract class that extends `StringlikeLiteral`, with implementations for `RegExpLiteral` and for string literals that 'flow' into functions known to interpret string arguments as regular expressions, such as `Regexp.new` and `String.match`.

View File

@@ -7,8 +7,32 @@
private import codeql.ruby.ast.Literal as AST
private import codeql.Locations
private import codeql.ruby.DataFlow
private import codeql.ruby.TaintTracking
private import codeql.ruby.typetracking.TypeTracker
private import codeql.ruby.ApiGraphs
/**
* A `StringlikeLiteral` containing a regular expression term, that is, either
* a regular expression literal, or a string literal used in a context where
* it is parsed as regular expression.
*/
abstract class RegExp extends AST::StringlikeLiteral {
/**
* Holds if this `RegExp` has the `m` (multiline) flag, which lets `.` match newlines.
*/
predicate isDotAll() { none() }
/**
* Holds if this `RegExp` has the `i` flag for case-insensitive matching.
*/
predicate isIgnoreCase() { none() }
/**
* Gets the flags for this `RegExp`, or the empty string if it has no flags.
*/
string getFlags() { result = "" }
class RegExp extends AST::RegExpLiteral {
/**
* Helper predicate for `charSetStart(int start, int end)`.
*
@@ -933,3 +957,63 @@ class RegExp extends AST::RegExpLiteral {
this.lastPart(start, end)
}
}
/**
 * A regular expression literal, viewed as a `RegExp`.
 *
 * Each flag-related member simply forwards to the corresponding member of
 * `AST::RegExpLiteral`.
 */
private class RegExpLiteralRegExp extends RegExp, AST::RegExpLiteral {
  /** Gets the flags of this literal, as reported by `getFlagString()`. */
  override string getFlags() { result = this.getFlagString() }

  /** Holds if `hasCaseInsensitiveFlag()` holds for this literal. */
  override predicate isIgnoreCase() { this.hasCaseInsensitiveFlag() }

  /** Holds if `hasMultilineFlag()` holds for this literal. */
  override predicate isDotAll() { this.hasMultilineFlag() }
}
/**
 * An expression whose data-flow node is reported by `regExpSource` as
 * flowing to some node `parse`, viewed as a `RegExp`.
 */
private class ParsedStringRegExp extends RegExp {
  private DataFlow::Node parse;

  ParsedStringRegExp() { this = regExpSource(parse).asExpr().getExpr() }

  /** Gets a node that `regExpSource` relates this expression to. */
  DataFlow::Node getAParse() { result = parse }

  // The overrides below deliberately contribute no results of their own.
  // In particular `getFlags()` has no result here, not even the empty string.
  // NOTE(review): presumably so that an expression that is also a regexp
  // literal only reports the flags of that literal; confirm.
  override string getFlags() { none() }

  override predicate isIgnoreCase() { none() }

  override predicate isDotAll() { none() }
}
/**
 * Holds if `source` sits in an argument position whose value may be
 * interpreted as a regular expression.
 */
cached
private predicate isInterpretedAsRegExp(DataFlow::Node source) {
  // Argument 0 of any call to a method named `match` or `match?`.
  // Only the method name is checked; the receiver is not constrained.
  source =
    any(DataFlow::CallNode call | call.getMethodName() = ["match", "match?"]).getArgument(0)
  or
  // Argument 0 of a call to `Regexp.new` or `Regexp.compile`.
  source = API::getTopLevelMember("Regexp").getAMethodCall(["compile", "new"]).getArgument(0)
}
/**
* Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
* as a part of a regular expression.
*
* `re` is a node for which `isInterpretedAsRegExp` holds, and `t` is the
* `TypeBackTracker` state associated with `result`. The single-argument overload
* of `regExpSource` fixes `t` to `TypeBackTracker::end()`.
*/
private DataFlow::Node regExpSource(DataFlow::Node re, TypeBackTracker t) {
// Base case: tracking starts at `re` itself, which must be a regexp-interpreting position.
t.start() and
re = result and
isInterpretedAsRegExp(result)
or
// Recursive case: `succ` is already known to reach `re` with tracker state `t2`,
// and `result` is connected to `succ` by one of the two kinds of step below.
exists(TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
// A type-tracking small step between `result` and `succ`.
// NOTE(review): type-tracking recursions are commonly written
// `t = t2.smallstep(result, succ)`; confirm the `t`/`t2` roles here are intended.
t2 = t.smallstep(result, succ)
or
// A local taint step from `result` to `succ`; the tracker state is left unchanged.
TaintTracking::localTaintStep(result, succ) and
t = t2
)
}
/**
 * Gets a node whose value may flow (inter-procedurally) to `re`, where it is
 * interpreted as a part of a regular expression.
 *
 * This is the public entry point: it is the tracker-parameterised overload of
 * `regExpSource` evaluated at `TypeBackTracker::end()`.
 */
DataFlow::Node regExpSource(DataFlow::Node re) {
  result = regExpSource(re, TypeBackTracker::end())
}

View File

@@ -1,7 +1,7 @@
private import codeql.ruby.ast.Literal as AST
private import codeql.Locations
private import ParseRegExp
import codeql.Locations
private import codeql.ruby.DataFlow
/**
* Holds if `term` is an escape class representing e.g. `\d`.
@@ -27,7 +27,7 @@ predicate isEscapeClass(RegExpTerm term, string clazz) {
* Holds if the regular expression should not be considered.
*/
predicate isExcluded(RegExpParent parent) {
parent.(RegExpTerm).getRegExp().hasFreeSpacingFlag() // exclude free-spacing mode regexes
parent.(RegExpTerm).getRegExp().(AST::RegExpLiteral).hasFreeSpacingFlag() // exclude free-spacing mode regexes
}
/**
@@ -93,11 +93,11 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
override RegExpTerm getChild(int i) { i = 0 and result.getRegExp() = re and result.isRootTerm() }
predicate isDotAll() { re.hasMultilineFlag() }
predicate isDotAll() { re.isDotAll() }
predicate isIgnoreCase() { re.hasCaseInsensitiveFlag() }
predicate isIgnoreCase() { re.isIgnoreCase() }
string getFlags() { result = re.getFlagString() }
string getFlags() { result = re.getFlags() }
override string getAPrimaryQlClass() { result = "RegExpLiteral" }
}
@@ -795,3 +795,47 @@ class RegExpNamedCharacterProperty extends RegExpTerm, TRegExpNamedCharacterProp
RegExpTerm getParsedRegExp(AST::RegExpLiteral re) {
  // Among the terms whose regular expression is `re`, select the root term.
  result.isRootTerm() and
  re = result.getRegExp()
}
/**
 * A data-flow node whose value may reach a position where it is interpreted
 * as (part of) a regular expression.
 *
 * Concrete subclasses supply both the parse position(s) and the parsed term.
 */
abstract class RegExpPatternSource extends DataFlow::Node {
  /**
   * Gets the root term of the regular expression parsed from this pattern.
   */
  abstract RegExpTerm getRegExpTerm();

  /**
   * Gets a node at which the pattern of this node is parsed as a part of
   * a regular expression.
   */
  abstract DataFlow::Node getAParse();
}
/**
 * The data-flow node of a regular expression literal, acting as its own
 * pattern source: the node is its own parse position.
 */
private class RegExpLiteralPatternSource extends RegExpPatternSource {
  private AST::RegExpLiteral astNode;

  RegExpLiteralPatternSource() { this.asExpr().getExpr() = astNode }

  /** Gets the result of `getParsed()` on the underlying literal. */
  override RegExpTerm getRegExpTerm() { result = astNode.getParsed() }

  /** Gets this node itself. */
  override DataFlow::Node getAParse() { result = this }
}
/**
 * A node whose string value may flow to a position where it is interpreted
 * as a part of a regular expression.
 */
private class StringRegExpPatternSource extends RegExpPatternSource {
  private DataFlow::Node parse;

  StringRegExpPatternSource() { this = regExpSource(parse) }

  /** Gets a node that `regExpSource` reports this node as flowing to. */
  override DataFlow::Node getAParse() { result = parse }

  /**
   * Gets the root term of the regular expression parsed from the expression
   * underlying this node.
   */
  override RegExpTerm getRegExpTerm() {
    result.getRegExp() = this.asExpr().getExpr() and
    // Without this restriction every nested term of the parsed pattern would
    // be returned, contradicting the "root term" contract of the base class.
    result.isRootTerm()
  }
}

View File

@@ -20,6 +20,7 @@
| tst.rb:74:10:74:17 | (b\|a?b)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |
| tst.rb:77:10:77:17 | (a\|aa?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.rb:83:10:83:16 | (.\|\\n)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
| tst.rb:89:21:89:28 | (a\|aa?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
| tst.rb:95:11:95:24 | ([\\S\\s]\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '`'. |
| tst.rb:101:11:101:19 | (.\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '`'. |
| tst.rb:107:11:107:19 | (b\|[^a])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |

View File

@@ -85,7 +85,7 @@ bad16 = /(.|\n)*!/m
# GOOD
good8 = /([\w.]+)*/
# BAD - we don't yet parse regexps constructed from strings
# NOT GOOD
bad17 = Regexp.new '(a|aa?)*b'
# GOOD - not used as regexp