add a RegexExecution, and use it to track regular expressions to their uses in a nice way in rb/polynomial-redos

This commit is contained in:
erik-krogh
2023-01-12 21:34:40 +01:00
parent 6e33dd5df6
commit acf28ebd98
5 changed files with 204 additions and 82 deletions

View File

@@ -10,6 +10,7 @@ private import codeql.ruby.DataFlow
private import codeql.ruby.Frameworks
private import codeql.ruby.dataflow.RemoteFlowSources
private import codeql.ruby.ApiGraphs
private import codeql.ruby.Regexp as RE
/**
* A data-flow node that constructs a SQL statement.
@@ -77,6 +78,55 @@ module SqlExecution {
}
}
/**
 * A data-flow node that executes a regular expression.
 *
 * The node must be an instance of `RegexExecution::Range`; every member predicate
 * below simply forwards to the corresponding predicate of that range class.
 *
 * Extend this class to refine existing API models. If you want to model new APIs,
 * extend `RegexExecution::Range` instead.
 */
class RegexExecution extends DataFlow::Node instanceof RegexExecution::Range {
/** Gets the data flow node for the regex being executed by this node. */
DataFlow::Node getRegex() { result = super.getRegex() }
/** Gets a data flow node for the string to be searched or matched against. */
DataFlow::Node getString() { result = super.getString() }
/** Gets a parsed regular expression term that is executed at this node. */
RE::RegExpTerm getTerm() { result = super.getTerm() }
/**
 * Gets the name of this regex execution, typically the name of an executing method.
 * This is used for nice alert messages and should include the module if possible.
 */
string getName() { result = super.getName() }
}
/** Provides classes for modeling new regular-expression execution APIs. */
module RegexExecution {
/**
 * A data-flow node that executes a regular expression.
 *
 * All member predicates are abstract, so a concrete subclass has to provide
 * the regex node, the string node, the parsed term, and a name.
 *
 * Extend this class to model new APIs. If you want to refine existing API models,
 * extend `RegexExecution` instead.
 */
abstract class Range extends DataFlow::Node {
/** Gets the data flow node for the regex being executed by this node. */
abstract DataFlow::Node getRegex();
/** Gets a data flow node for the string to be searched or matched against. */
abstract DataFlow::Node getString();
/** Gets the parsed regular expression term that is executed by this node. */
abstract RE::RegExpTerm getTerm();
/**
 * Gets the name of this regex execution, typically the name of an executing method.
 * This is used for nice alert messages and should include the module if possible.
 */
abstract string getName();
}
}
/**
* A data flow node that performs a file system access, including reading and writing data,
* creating and deleting files and folders, checking and updating permissions, and so on.

View File

@@ -8,9 +8,11 @@
import regexp.RegExpTreeView // re-export
private import regexp.internal.ParseRegExp
private import regexp.internal.RegExpConfiguration
private import codeql.ruby.ast.Literal as Ast
private import codeql.ruby.AST as Ast
private import codeql.ruby.CFG
private import codeql.ruby.DataFlow
private import codeql.ruby.ApiGraphs
private import codeql.ruby.Concepts
/**
* Provides utility predicates related to regular expressions.
@@ -63,7 +65,11 @@ private class RegExpLiteralPatternSource extends RegExpPatternSource {
private class StringRegExpPatternSource extends RegExpPatternSource {
private DataFlow::Node parse;
StringRegExpPatternSource() { this = regExpSource(parse) }
StringRegExpPatternSource() {
this = regExpSource(parse) and
// `regExpSource()` tracks both strings and regex literals, narrow it down to strings.
this.asExpr().getConstantValue().isString(_)
}
override DataFlow::Node getAParse() { result = parse }
@@ -104,6 +110,7 @@ module RegExpInterpretation {
/**
* A node interpreted as a regular expression.
* Specifically, nodes where string values are interpreted as regular expressions.
*/
class StdLibRegExpInterpretation extends RegExpInterpretation::Range {
StdLibRegExpInterpretation() {
@@ -128,3 +135,87 @@ cached
DataFlow::Node regExpSource(DataFlow::Node re) {
exists(RegExpConfiguration c | c.hasFlow(result, re))
}
/**
 * Holds if `exec` is a node at which `regexp` is run as a regular expression
 * against the string value of `input`.
 *
 * `name` is a short description of the execution: the operator for `=~`/`!~`,
 * `String#<method>` or `Regexp#<method>` for method calls, and `case-when` for
 * `case` expressions.
 */
private predicate regexExecution(
  DataFlow::Node exec, DataFlow::Node input, DataFlow::Node regexp, string name
) {
  // The match operators `=~` and `!~`; the regexp may be on either side.
  exists(CfgNodes::ExprNodes::BinaryOperationCfgNode matchOp |
    exec.asExpr() = matchOp and
    name = matchOp.getOperator() and
    (
      matchOp.getExpr() instanceof Ast::NoRegExpMatchExpr or
      matchOp.getExpr() instanceof Ast::RegExpMatchExpr
    ) and
    (
      regexp.asExpr() = matchOp.getRightOperand() and input.asExpr() = matchOp.getLeftOperand()
      or
      regexp.asExpr() = matchOp.getLeftOperand() and input.asExpr() = matchOp.getRightOperand()
    )
  )
  or
  // A `case` expression: the tested value is the input, and any pattern of a
  // `when` or `in` branch is the regexp.
  exists(CfgNodes::ExprNodes::CaseExprCfgNode caseNode |
    exec.asExpr() = caseNode and
    input.asExpr() = caseNode.getValue() and
    name = "case-when" and
    (
      regexp.asExpr() = caseNode.getBranch(_).(CfgNodes::ExprNodes::InClauseCfgNode).getPattern()
      or
      regexp.asExpr() =
        caseNode.getBranch(_).(CfgNodes::ExprNodes::WhenClauseCfgNode).getPattern(_)
    )
  )
  or
  // Method calls.
  exists(DataFlow::CallNode call |
    call = exec and
    (
      // `match` / `match?` with the regexp as the receiver and the string as the argument.
      call.getMethodName() = ["match", "match?"] and
      name = "Regexp#" + call.getMethodName() and
      input = call.getArgument(0) and
      regexp = call.getReceiver()
      or
      // Any `String` method taking a regexp as its first argument.
      call.getMethodName() =
        [
          "[]", "gsub", "gsub!", "index", "match", "match?", "partition", "rindex", "rpartition",
          "scan", "slice!", "split", "sub", "sub!"
        ] and
      name = "String#" + call.getMethodName() and
      regexp = call.getArgument(0) and
      input = call.getReceiver() and
      // Leave out https://ruby-doc.org/core-2.4.0/Regexp.html#method-i-match, i.e. calls whose
      // receiver is tracked as a regexp; those are covered by the `Regexp#` case above.
      // Also see `StdLibRegExpInterpretation`.
      not (
        call.getReceiver() = trackRegexpType() and
        call.getMethodName() = ["match", "match?"]
      )
    )
  )
}
/**
 * A regular-expression execution performed by the standard library, as
 * recognised by `regexExecution`.
 */
private class StdRegexpExecution extends RegexExecution::Range {
  DataFlow::Node pattern;
  DataFlow::Node subject;
  string description;

  StdRegexpExecution() { regexExecution(this, subject, pattern, description) }

  override string getName() { result = description }

  override DataFlow::Node getString() { result = subject }

  override DataFlow::Node getRegex() { result = pattern }

  // The parsed term is looked up from the regex node via `getTermForNode`.
  override RegExpTerm getTerm() { result = getTermForNode(pattern) }
}
/**
 * Gets the regexp term of a `RegExpPatternSource` that `regExpSource` associates
 * with `node`.
 */
private RegExpTerm getTermForNode(DataFlow::Node node) {
  result = regExpSource(node).(RegExpPatternSource).getRegExpTerm()
}

View File

@@ -6,6 +6,7 @@ private import codeql.ruby.controlflow.CfgNodes
private import codeql.ruby.dataflow.internal.DataFlowImplForRegExp
private import codeql.ruby.typetracking.TypeTracker
private import codeql.ruby.ApiGraphs
private import codeql.ruby.Concepts
private import codeql.ruby.dataflow.internal.DataFlowPrivate as DataFlowPrivate
private import codeql.ruby.TaintTracking
private import codeql.ruby.frameworks.core.String
@@ -13,18 +14,30 @@ private import codeql.ruby.frameworks.core.String
class RegExpConfiguration extends Configuration {
RegExpConfiguration() { this = "RegExpConfiguration" }
override predicate isSource(DataFlow::Node source) {
override predicate isSource(DataFlow::Node source, DataFlow::FlowState state) {
// track both string literals and regexp literals - the latter for finding executions of regular expressions that are used elsewhere.
state = "string" and
source.asExpr() =
any(ExprCfgNode e |
e.getConstantValue().isString(_) and
not e instanceof ExprNodes::VariableReadAccessCfgNode and
not e instanceof ExprNodes::ConstantReadAccessCfgNode
)
or
state = "reg" and
source.asExpr().getExpr() instanceof Ast::RegExpLiteral
}
override predicate isSink(DataFlow::Node sink) { sink instanceof RegExpInterpretation::Range }
override predicate isSink(DataFlow::Node sink, DataFlow::FlowState state) {
state = "string" and
sink instanceof RegExpInterpretation::Range
or
state = "reg" and
sink = any(RegexExecution exec).getRegex()
}
override predicate isBarrier(DataFlow::Node node) {
override predicate isBarrier(DataFlow::Node node, DataFlow::FlowState state) {
state = "string" and
exists(DataFlow::CallNode mce | mce.getMethodName() = ["match", "match?"] |
// receiver of https://ruby-doc.org/core-2.4.0/String.html#method-i-match
node = mce.getReceiver() and
@@ -36,22 +49,29 @@ class RegExpConfiguration extends Configuration {
)
}
override predicate isAdditionalFlowStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
// include taint flow through `String` summaries
TaintTracking::localTaintStep(nodeFrom, nodeTo) and
nodeFrom.(DataFlowPrivate::SummaryNode).getSummarizedCallable() instanceof
String::SummarizedCallable
or
// string concatenations, and
exists(CfgNodes::ExprNodes::OperationCfgNode op |
op = nodeTo.asExpr() and
op.getAnOperand() = nodeFrom.asExpr() and
op.getExpr().(Ast::BinaryOperation).getOperator() = "+"
override predicate isAdditionalFlowStep(
DataFlow::Node nodeFrom, DataFlow::FlowState stateFrom, DataFlow::Node nodeTo,
DataFlow::FlowState stateTo
) {
stateFrom = stateTo and
stateFrom = "string" and
(
// include taint flow through `String` summaries
TaintTracking::localTaintStep(nodeFrom, nodeTo) and
nodeFrom.(DataFlowPrivate::SummaryNode).getSummarizedCallable() instanceof
String::SummarizedCallable
or
// string concatenations, and
exists(CfgNodes::ExprNodes::OperationCfgNode op |
op = nodeTo.asExpr() and
op.getAnOperand() = nodeFrom.asExpr() and
op.getExpr().(Ast::BinaryOperation).getOperator() = "+"
)
or
// string interpolations
nodeFrom.asExpr() =
nodeTo.asExpr().(CfgNodes::ExprNodes::StringlikeLiteralCfgNode).getAComponent()
)
or
// string interpolations
nodeFrom.asExpr() =
nodeTo.asExpr().(CfgNodes::ExprNodes::StringlikeLiteralCfgNode).getAComponent()
}
}

View File

@@ -9,6 +9,8 @@ private import codeql.ruby.CFG
private import codeql.ruby.DataFlow
private import codeql.ruby.dataflow.RemoteFlowSources
private import codeql.ruby.regexp.RegExpTreeView::RegexTreeView as TreeView
private import codeql.ruby.Regexp as RE
private import codeql.ruby.Concepts
/**
* Provides default sources, sinks and sanitizers for reasoning about
@@ -53,80 +55,22 @@ module PolynomialReDoS {
*/
class RemoteFlowSourceAsSource extends Source, RemoteFlowSource { }
/**
 * Gets the parsed form of a regular expression literal that is a local source
 * flowing to `node`.
 */
RegExpTerm getRegExpObjectFromNode(DataFlow::Node node) {
  exists(DataFlow::LocalSourceNode origin, CfgNodes::ExprNodes::RegExpLiteralCfgNode literal |
    literal = origin.asExpr() and
    origin.flowsTo(node) and
    result = literal.getExpr().getParsed()
  )
}
/**
* A regexp match against a superlinear backtracking term, seen as a sink for
* polynomial regular expression denial-of-service vulnerabilities.
*/
class PolynomialBackTrackingTermMatch extends Sink {
PolynomialBackTrackingTerm term;
DataFlow::ExprNode matchNode;
RegexExecution exec;
PolynomialBackTrackingTermMatch() {
exists(DataFlow::Node regexp |
term.getRootTerm() = getRegExpObjectFromNode(regexp) and
(
// `=~` or `!~`
exists(CfgNodes::ExprNodes::BinaryOperationCfgNode op |
matchNode.asExpr() = op and
(
op.getExpr() instanceof Ast::RegExpMatchExpr or
op.getExpr() instanceof Ast::NoRegExpMatchExpr
) and
(
this.asExpr() = op.getLeftOperand() and regexp.asExpr() = op.getRightOperand()
or
this.asExpr() = op.getRightOperand() and regexp.asExpr() = op.getLeftOperand()
)
)
or
// Any of the methods on `String` that take a regexp.
exists(CfgNodes::ExprNodes::MethodCallCfgNode call |
matchNode.asExpr() = call and
call.getExpr().getMethodName() =
[
"[]", "gsub", "gsub!", "index", "match", "match?", "partition", "rindex",
"rpartition", "scan", "slice!", "split", "sub", "sub!"
] and
this.asExpr() = call.getReceiver() and
regexp.asExpr() = call.getArgument(0)
)
or
// A call to `match` or `match?` where the regexp is the receiver.
exists(CfgNodes::ExprNodes::MethodCallCfgNode call |
matchNode.asExpr() = call and
call.getExpr().getMethodName() = ["match", "match?"] and
regexp.asExpr() = call.getReceiver() and
this.asExpr() = call.getArgument(0)
)
or
// a case-when statement
exists(CfgNodes::ExprNodes::CaseExprCfgNode caseWhen |
matchNode.asExpr() = caseWhen and
this.asExpr() = caseWhen.getValue()
|
regexp.asExpr() =
caseWhen.getBranch(_).(CfgNodes::ExprNodes::WhenClauseCfgNode).getPattern(_)
or
regexp.asExpr() =
caseWhen.getBranch(_).(CfgNodes::ExprNodes::InClauseCfgNode).getPattern()
)
)
)
term.getRootTerm() = exec.getTerm() and
this = exec.getString()
}
override RegExpTerm getRegExp() { result = term }
override DataFlow::Node getHighlight() { result = matchNode }
override DataFlow::Node getHighlight() { result = exec }
}
private predicate lengthGuard(CfgNodes::AstCfgNode g, CfgNode node, boolean branch) {

View File

@@ -23,6 +23,13 @@ edges
| PolynomialReDoS.rb:29:9:29:18 | ...[...] : | PolynomialReDoS.rb:30:5:30:5 | b |
| PolynomialReDoS.rb:31:9:31:14 | call to params : | PolynomialReDoS.rb:31:9:31:18 | ...[...] : |
| PolynomialReDoS.rb:31:9:31:18 | ...[...] : | PolynomialReDoS.rb:32:5:32:5 | c |
| PolynomialReDoS.rb:54:12:54:17 | call to params : | PolynomialReDoS.rb:54:12:54:24 | ...[...] : |
| PolynomialReDoS.rb:54:12:54:24 | ...[...] : | PolynomialReDoS.rb:56:38:56:41 | name : |
| PolynomialReDoS.rb:54:12:54:24 | ...[...] : | PolynomialReDoS.rb:58:37:58:40 | name : |
| PolynomialReDoS.rb:56:38:56:41 | name : | PolynomialReDoS.rb:61:33:61:37 | input : |
| PolynomialReDoS.rb:58:37:58:40 | name : | PolynomialReDoS.rb:65:42:65:46 | input : |
| PolynomialReDoS.rb:61:33:61:37 | input : | PolynomialReDoS.rb:62:5:62:9 | input |
| PolynomialReDoS.rb:65:42:65:46 | input : | PolynomialReDoS.rb:66:5:66:9 | input |
nodes
| PolynomialReDoS.rb:4:12:4:17 | call to params : | semmle.label | call to params : |
| PolynomialReDoS.rb:4:12:4:24 | ...[...] : | semmle.label | ...[...] : |
@@ -52,6 +59,14 @@ nodes
| PolynomialReDoS.rb:32:5:32:5 | c | semmle.label | c |
| PolynomialReDoS.rb:42:10:42:13 | name | semmle.label | name |
| PolynomialReDoS.rb:47:10:47:13 | name | semmle.label | name |
| PolynomialReDoS.rb:54:12:54:17 | call to params : | semmle.label | call to params : |
| PolynomialReDoS.rb:54:12:54:24 | ...[...] : | semmle.label | ...[...] : |
| PolynomialReDoS.rb:56:38:56:41 | name : | semmle.label | name : |
| PolynomialReDoS.rb:58:37:58:40 | name : | semmle.label | name : |
| PolynomialReDoS.rb:61:33:61:37 | input : | semmle.label | input : |
| PolynomialReDoS.rb:62:5:62:9 | input | semmle.label | input |
| PolynomialReDoS.rb:65:42:65:46 | input : | semmle.label | input : |
| PolynomialReDoS.rb:66:5:66:9 | input | semmle.label | input |
subpaths
#select
| PolynomialReDoS.rb:10:5:10:17 | ... =~ ... | PolynomialReDoS.rb:4:12:4:17 | call to params : | PolynomialReDoS.rb:10:5:10:8 | name | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | PolynomialReDoS.rb:7:19:7:21 | \\s+ | regular expression | PolynomialReDoS.rb:4:12:4:17 | call to params | user-provided value |
@@ -74,3 +89,5 @@ subpaths
| PolynomialReDoS.rb:32:5:32:20 | call to sub! | PolynomialReDoS.rb:31:9:31:14 | call to params : | PolynomialReDoS.rb:32:5:32:5 | c | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | PolynomialReDoS.rb:7:19:7:21 | \\s+ | regular expression | PolynomialReDoS.rb:31:9:31:14 | call to params | user-provided value |
| PolynomialReDoS.rb:42:5:45:7 | case ... | PolynomialReDoS.rb:4:12:4:17 | call to params : | PolynomialReDoS.rb:42:10:42:13 | name | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | PolynomialReDoS.rb:7:19:7:21 | \\s+ | regular expression | PolynomialReDoS.rb:4:12:4:17 | call to params | user-provided value |
| PolynomialReDoS.rb:47:5:50:7 | case ... | PolynomialReDoS.rb:4:12:4:17 | call to params : | PolynomialReDoS.rb:47:10:47:13 | name | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | PolynomialReDoS.rb:48:14:48:16 | \\s+ | regular expression | PolynomialReDoS.rb:4:12:4:17 | call to params | user-provided value |
| PolynomialReDoS.rb:62:5:62:22 | call to gsub | PolynomialReDoS.rb:54:12:54:17 | call to params : | PolynomialReDoS.rb:62:5:62:9 | input | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | PolynomialReDoS.rb:56:31:56:33 | \\s+ | regular expression | PolynomialReDoS.rb:54:12:54:17 | call to params | user-provided value |
| PolynomialReDoS.rb:66:5:66:34 | call to match? | PolynomialReDoS.rb:54:12:54:17 | call to params : | PolynomialReDoS.rb:66:5:66:9 | input | This $@ that depends on a $@ may run slow on strings with many repetitions of ' '. | PolynomialReDoS.rb:58:30:58:32 | \\s+ | regular expression | PolynomialReDoS.rb:54:12:54:17 | call to params | user-provided value |