From c1c437f0206ec7bcbbca2cc807a2aa149df883de Mon Sep 17 00:00:00 2001 From: Nick Rolfe Date: Fri, 16 Apr 2021 16:54:30 +0100 Subject: [PATCH] Minimal implementation of shared type-tracking library --- .../dataflow/internal/DataFlowDispatch.qll | 6 + .../dataflow/internal/DataFlowPublic.qll | 32 ++ .../codeql_ruby/typetracking/TypeTracker.qll | 420 ++++++++++++++++++ .../typetracking/TypeTrackerPrivate.qll | 119 +++++ 4 files changed, 577 insertions(+) create mode 100644 ql/src/codeql_ruby/typetracking/TypeTracker.qll create mode 100644 ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll diff --git a/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll b/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll index b412793ffaa..a539e18b069 100644 --- a/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll +++ b/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll @@ -40,6 +40,12 @@ class DataFlowCallable = CfgScope; class DataFlowCall extends CfgNodes::ExprNodes::CallCfgNode { DataFlowCallable getEnclosingCallable() { result = this.getScope() } + + DataFlowCallable getCallable() { + // TODO: this is a placeholder that finds a method with the same name, iff it's uniquely named. + result = + unique(DataFlowCallable c | c.(Method).getName() = this.getNode().(MethodCall).getMethodName()) + } } /** Gets a viable run-time target for the call `call`. */ diff --git a/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll b/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll index 7a8d40a5e1a..0a8716a0b07 100644 --- a/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll +++ b/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll @@ -2,6 +2,7 @@ private import ruby private import DataFlowDispatch private import DataFlowPrivate private import codeql_ruby.CFG +private import codeql_ruby.typetracking.TypeTracker /** * An element, viewed as a node in a data flow graph. Either an expression @@ -36,6 +37,14 @@ class Node extends TNode { ) { getLocation().hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn) } + + /** + * Gets a node that this node may flow to using one heap and/or interprocedural step. + * + * See `TypeTracker` for more details about how to use this. + */ + pragma[inline] + Node track(TypeTracker t2, TypeTracker t) { t = t2.step(this, result) } } /** @@ -73,6 +82,29 @@ class ParameterNode extends Node, TParameterNode { predicate isParameterOf(Callable c, int i) { p = c.getParameter(i) } } +/** + * A data-flow node that is a source of local flow. + */ +class LocalSourceNode extends Node { + LocalSourceNode() { not simpleLocalFlowStep+(any(ExprNode n), this) } + + /** Holds if this `LocalSourceNode` can flow to `nodeTo` in one or more local flow steps. */ + pragma[inline] + predicate flowsTo(Node nodeTo) { hasLocalSource(nodeTo, this) } +} + +predicate hasLocalSource(Node sink, Node source) { + // Declaring `source` to be a `SourceNode` currently causes a redundant check in the + // recursive case, so instead we check it explicitly here. + source = sink and + source instanceof LocalSourceNode + or + exists(Node mid | + hasLocalSource(mid, source) and + simpleLocalFlowStep(mid, sink) + ) +} + /** Gets a node corresponding to expression `e`. */ ExprNode exprNode(CfgNodes::ExprCfgNode e) { result.getExprNode() = e } diff --git a/ql/src/codeql_ruby/typetracking/TypeTracker.qll b/ql/src/codeql_ruby/typetracking/TypeTracker.qll new file mode 100644 index 00000000000..46461d3e22e --- /dev/null +++ b/ql/src/codeql_ruby/typetracking/TypeTracker.qll @@ -0,0 +1,420 @@ +/** Step Summaries and Type Tracking */ + +private import TypeTrackerPrivate + +/** + * Any string that may appear as the name of a piece of content. This will usually include things like: + * - Attribute names (in Python) + * - Property names (in JavaScript) + * + * In general, this can also be used to model things like stores to specific list indices. To ensure + * correctness, it is important that + * + * - different types of content do not have overlapping names, and + * - the empty string `""` is not a valid piece of content, as it is used to indicate the absence of + * content instead. + */ +class ContentName extends string { + ContentName() { this = getPossibleContentName() } +} + +/** Either a content name, or the empty string (representing no content). */ +class OptionalContentName extends string { + OptionalContentName() { this instanceof ContentName or this = "" } +} + +/** + * A description of a step on an inter-procedural data flow path. + */ +private newtype TStepSummary = + LevelStep() or + CallStep() or + ReturnStep() or + StoreStep(ContentName content) or + LoadStep(ContentName content) + +/** + * INTERNAL: Use `TypeTracker` or `TypeBackTracker` instead. + * + * A description of a step on an inter-procedural data flow path. + */ +class StepSummary extends TStepSummary { + /** Gets a textual representation of this step summary. */ + string toString() { + this instanceof LevelStep and result = "level" + or + this instanceof CallStep and result = "call" + or + this instanceof ReturnStep and result = "return" + or + exists(string content | this = StoreStep(content) | result = "store " + content) + or + exists(string content | this = LoadStep(content) | result = "load " + content) + } +} + +/** Provides predicates for updating step summaries (`StepSummary`s). */ +module StepSummary { + /** + * Gets the summary that corresponds to having taken a forwards + * heap and/or inter-procedural step from `nodeFrom` to `nodeTo`. + */ + cached + predicate step(LocalSourceNode nodeFrom, LocalSourceNode nodeTo, StepSummary summary) { + exists(Node mid | nodeFrom.flowsTo(mid) and smallstep(mid, nodeTo, summary)) + } + + /** + * Gets the summary that corresponds to having taken a forwards + * local, heap and/or inter-procedural step from `nodeFrom` to `nodeTo`. + * + * Unlike `StepSummary::step`, this predicate does not compress + * type-preserving steps. + */ + predicate smallstep(Node nodeFrom, Node nodeTo, StepSummary summary) { + typePreservingStep(nodeFrom, nodeTo) and + summary = LevelStep() + or + callStep(nodeFrom, nodeTo) and summary = CallStep() + or + returnStep(nodeFrom, nodeTo) and + summary = ReturnStep() + or + exists(string content | + localSourceStoreStep(nodeFrom, nodeTo, content) and + summary = StoreStep(content) + or + basicLoadStep(nodeFrom, nodeTo, content) and summary = LoadStep(content) + ) + } + + /** + * Holds if `nodeFrom` is being written to the `content` content of the object in `nodeTo`. + * + * Note that `nodeTo` will always be a local source node that flows to the place where the content + * is written in `basicStoreStep`. This may lead to the flow of information going "back in time" + * from the point of view of the execution of the program. + * + * For instance, if we interpret attribute writes in Python as writing to content with the same + * name as the attribute and consider the following snippet + * + * ```python + * def foo(y): + * x = Foo() + * bar(x) + * x.attr = y + * baz(x) + * + * def bar(x): + * z = x.attr + * ``` + * for the attribute write `x.attr = y`, we will have `content` being the literal string `"attr"`, + * `nodeFrom` will be `y`, and `nodeTo` will be the object `Foo()` created on the first line of the + * function. This means we will track the fact that `x.attr` can have the type of `y` into the + * assignment to `z` inside `bar`, even though this attribute write happens _after_ `bar` is called. + */ + predicate localSourceStoreStep(Node nodeFrom, LocalSourceNode nodeTo, string content) { + exists(Node obj | nodeTo.flowsTo(obj) and basicStoreStep(nodeFrom, obj, content)) + } +} + +private newtype TTypeTracker = MkTypeTracker(Boolean hasCall, OptionalContentName content) + +/** + * Summary of the steps needed to track a value to a given dataflow node. + * + * This can be used to track objects that implement a certain API in order to + * recognize calls to that API. Note that type-tracking does not by itself provide a + * source/sink relation, that is, it may determine that a node has a given type, + * but it won't determine where that type came from. + * + * It is recommended that all uses of this type are written in the following form, + * for tracking some type `myType`: + * ```ql + * DataFlow::LocalSourceNode myType(DataFlow::TypeTracker t) { + * t.start() and + * result = < source of myType > + * or + * exists (DataFlow::TypeTracker t2 | + * result = myType(t2).track(t2, t) + * ) + * } + * + * DataFlow::Node myType() { myType(DataFlow::TypeTracker::end()).flowsTo(result) } + * ``` + * + * Instead of `result = myType(t2).track(t2, t)`, you can also use the equivalent + * `t = t2.step(myType(t2), result)`. If you additionally want to track individual + * intra-procedural steps, use `t = t2.smallstep(myCallback(t2), result)`. + */ +class TypeTracker extends TTypeTracker { + Boolean hasCall; + OptionalContentName content; + + TypeTracker() { this = MkTypeTracker(hasCall, content) } + + /** Gets the summary resulting from appending `step` to this type-tracking summary. */ + cached + TypeTracker append(StepSummary step) { + step = LevelStep() and result = this + or + step = CallStep() and result = MkTypeTracker(true, content) + or + step = ReturnStep() and hasCall = false and result = this + or + step = LoadStep(content) and result = MkTypeTracker(hasCall, "") + or + exists(string p | step = StoreStep(p) and content = "" and result = MkTypeTracker(hasCall, p)) + } + + /** Gets a textual representation of this summary. */ + string toString() { + exists(string withCall, string withContent | + (if hasCall = true then withCall = "with" else withCall = "without") and + (if content != "" then withContent = " with content " + content else withContent = "") and + result = "type tracker " + withCall + " call steps" + withContent + ) + } + + /** + * Holds if this is the starting point of type tracking. + */ + predicate start() { hasCall = false and content = "" } + + /** + * Holds if this is the starting point of type tracking, and the value starts in the content named `contentName`. + * The type tracking only ends after the content has been loaded. + */ + predicate startInContent(ContentName contentName) { hasCall = false and content = contentName } + + /** + * Holds if this is the starting point of type tracking + * when tracking a parameter into a call, but not out of it. + */ + predicate call() { hasCall = true and content = "" } + + /** + * Holds if this is the end point of type tracking. + */ + predicate end() { content = "" } + + /** + * INTERNAL. DO NOT USE. + * + * Holds if this type has been tracked into a call. + */ + boolean hasCall() { result = hasCall } + + /** + * INTERNAL. DO NOT USE. + * + * Gets the content associated with this type tracker. + */ + string getContent() { result = content } + + /** + * Gets a type tracker that starts where this one has left off to allow continued + * tracking. + * + * This predicate is only defined if the type is not associated to a piece of content. + */ + TypeTracker continue() { content = "" and result = this } + + /** + * Gets the summary that corresponds to having taken a forwards + * heap and/or inter-procedural step from `nodeFrom` to `nodeTo`. + */ + pragma[inline] + TypeTracker step(LocalSourceNode nodeFrom, Node nodeTo) { + exists(StepSummary summary | + StepSummary::step(nodeFrom, nodeTo, summary) and + result = this.append(summary) + ) + } + + /** + * Gets the summary that corresponds to having taken a forwards + * local, heap and/or inter-procedural step from `nodeFrom` to `nodeTo`. + * + * Unlike `TypeTracker::step`, this predicate exposes all edges + * in the flow graph, and not just the edges between `Node`s. + * It may therefore be less performant. + * + * Type tracking predicates using small steps typically take the following form: + * ```ql + * DataFlow::Node myType(DataFlow::TypeTracker t) { + * t.start() and + * result = < source of myType > + * or + * exists (DataFlow::TypeTracker t2 | + * t = t2.smallstep(myType(t2), result) + * ) + * } + * + * DataFlow::Node myType() { + * result = myType(DataFlow::TypeTracker::end()) + * } + * ``` + */ + pragma[inline] + TypeTracker smallstep(Node nodeFrom, Node nodeTo) { + exists(StepSummary summary | + StepSummary::smallstep(nodeFrom, nodeTo, summary) and + result = this.append(summary) + ) + or + typePreservingStep(nodeFrom, nodeTo) and + result = this + } +} + +/** Provides predicates for implementing custom `TypeTracker`s. */ +module TypeTracker { + /** + * Gets a valid end point of type tracking. + */ + TypeTracker end() { result.end() } +} + +private newtype TTypeBackTracker = MkTypeBackTracker(Boolean hasReturn, OptionalContentName content) + +/** + * Summary of the steps needed to back-track a use of a value to a given dataflow node. + * + * This can for example be used to track callbacks that are passed to a certain API, + * so we can model specific parameters of that callback as having a certain type. + * + * Note that type back-tracking does not provide a source/sink relation, that is, + * it may determine that a node will be used in an API call somewhere, but it won't + * determine exactly where that use was, or the path that led to the use. + * + * It is recommended that all uses of this type are written in the following form, + * for back-tracking some callback type `myCallback`: + * + * ```ql + * DataFlow::LocalSourceNode myCallback(DataFlow::TypeBackTracker t) { + * t.start() and + * result = (< some API call >).getArgument(< n >).getALocalSource() + * or + * exists (DataFlow::TypeBackTracker t2 | + * result = myCallback(t2).backtrack(t2, t) + * ) + * } + * + * DataFlow::LocalSourceNode myCallback() { result = myCallback(DataFlow::TypeBackTracker::end()) } + * ``` + * + * Instead of `result = myCallback(t2).backtrack(t2, t)`, you can also use the equivalent + * `t2 = t.step(result, myCallback(t2))`. If you additionally want to track individual + * intra-procedural steps, use `t2 = t.smallstep(result, myCallback(t2))`. + */ +class TypeBackTracker extends TTypeBackTracker { + Boolean hasReturn; + string content; + + TypeBackTracker() { this = MkTypeBackTracker(hasReturn, content) } + + /** Gets the summary resulting from prepending `step` to this type-tracking summary. */ + TypeBackTracker prepend(StepSummary step) { + step = LevelStep() and result = this + or + step = CallStep() and hasReturn = false and result = this + or + step = ReturnStep() and result = MkTypeBackTracker(true, content) + or + exists(string p | + step = LoadStep(p) and content = "" and result = MkTypeBackTracker(hasReturn, p) + ) + or + step = StoreStep(content) and result = MkTypeBackTracker(hasReturn, "") + } + + /** Gets a textual representation of this summary. */ + string toString() { + exists(string withReturn, string withContent | + (if hasReturn = true then withReturn = "with" else withReturn = "without") and + (if content != "" then withContent = " with content " + content else withContent = "") and + result = "type back-tracker " + withReturn + " return steps" + withContent + ) + } + + /** + * Holds if this is the starting point of type tracking. + */ + predicate start() { hasReturn = false and content = "" } + + /** + * Holds if this is the end point of type tracking. + */ + predicate end() { content = "" } + + /** + * INTERNAL. DO NOT USE. + * + * Holds if this type has been back-tracked into a call through return edge. + */ + boolean hasReturn() { result = hasReturn } + + /** + * Gets a type tracker that starts where this one has left off to allow continued + * tracking. + * + * This predicate is only defined if the type has not been tracked into a piece of content. + */ + TypeBackTracker continue() { content = "" and result = this } + + /** + * Gets the summary that corresponds to having taken a backwards + * heap and/or inter-procedural step from `nodeTo` to `nodeFrom`. + */ + pragma[inline] + TypeBackTracker step(LocalSourceNode nodeFrom, LocalSourceNode nodeTo) { + exists(StepSummary summary | + StepSummary::step(nodeFrom, nodeTo, summary) and + this = result.prepend(summary) + ) + } + + /** + * Gets the summary that corresponds to having taken a backwards + * local, heap and/or inter-procedural step from `nodeTo` to `nodeFrom`. + * + * Unlike `TypeBackTracker::step`, this predicate exposes all edges + * in the flowgraph, and not just the edges between + * `LocalSourceNode`s. It may therefore be less performant. + * + * Type tracking predicates using small steps typically take the following form: + * ```ql + * DataFlow::Node myType(DataFlow::TypeBackTracker t) { + * t.start() and + * result = < some API call >.getArgument(< n >) + * or + * exists (DataFlow::TypeBackTracker t2 | + * t = t2.smallstep(result, myType(t2)) + * ) + * } + * + * DataFlow::Node myType() { + * result = myType(DataFlow::TypeBackTracker::end()) + * } + * ``` + */ + pragma[inline] + TypeBackTracker smallstep(Node nodeFrom, Node nodeTo) { + exists(StepSummary summary | + StepSummary::smallstep(nodeFrom, nodeTo, summary) and + this = result.prepend(summary) + ) + or + typePreservingStep(nodeFrom, nodeTo) and + this = result + } +} + +/** Provides predicates for implementing custom `TypeBackTracker`s. */ +module TypeBackTracker { + /** + * Gets a valid end point of type back-tracking. + */ + TypeBackTracker end() { result.end() } +} diff --git a/ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll b/ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll new file mode 100644 index 00000000000..447c60ffa93 --- /dev/null +++ b/ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll @@ -0,0 +1,119 @@ +private import codeql_ruby.AST as AST +private import codeql_ruby.dataflow.internal.DataFlowPublic as DataFlowPublic +private import codeql_ruby.dataflow.internal.DataFlowPrivate as DataFlowPrivate +private import codeql_ruby.dataflow.internal.DataFlowDispatch as DataFlowDispatch + +class Node = DataFlowPublic::Node; + +class LocalSourceNode = DataFlowPublic::LocalSourceNode; + +/** Holds if it's reasonable to expect the data flow step from `nodeFrom` to `nodeTo` to preserve types. */ +predicate typePreservingStep(Node nodeFrom, Node nodeTo) { + DataFlowPrivate::simpleLocalFlowStep(nodeFrom, nodeTo) or + DataFlowPrivate::jumpStep(nodeFrom, nodeTo) +} + +/** + * Gets the name of a possible piece of content. This will usually include things like + * + * - Attribute names (in Python) + * - Property names (in JavaScript) + */ +string getPossibleContentName() { result = getSetterCallAttributeName(_) } + +/** Holds if `nodeFrom` steps to `nodeTo` by being passed as a parameter in a call. */ +predicate callStep( + DataFlowPrivate::ArgumentNode nodeFrom, DataFlowPrivate::ExplicitParameterNode nodeTo +) { + exists(DataFlowDispatch::DataFlowCall call, DataFlowDispatch::DataFlowCallable callable, int i | + call.getCallable() = callable and + nodeFrom.argumentOf(call, i) and + nodeTo.isParameterOf(callable, i) + ) +} + +/** Holds if `nodeFrom` steps to `nodeTo` by being returned from a call. */ +predicate returnStep(DataFlowPrivate::ReturnNode nodeFrom, Node nodeTo) { + exists(DataFlowDispatch::DataFlowCall call | + nodeFrom.getEnclosingCallable() = call.getCallable() and + nodeTo.asExpr().getNode() = call.getNode() + ) +} + +/** + * Holds if `nodeFrom` is being written to the `content` content of the object + * in `nodeTo`. + * + * Note that the choice of `nodeTo` does not have to make sense + * "chronologically". All we care about is whether the `content` content of + * `nodeTo` can have a specific type, and the assumption is that if a specific + * type appears here, then any access of that particular content can yield + * something of that particular type. + * + * Thus, in an example such as + * + * ```rb + * def foo(y) + * x = Foo.new + * bar(x) + * x.content = y + * baz(x) + * end + * + * def bar(x) + * z = x.content + * end + * ``` + * for the content write `x.content = y`, we will have `content` being the + * literal string `"content"`, `nodeFrom` will be `y`, and `nodeTo` will be the + * `Foo` object created on the first line of the function. This means we will + * track the fact that `x.content` can have the type of `y` into the assignment + * to `z` inside `bar`, even though this content write happens _after_ `bar` is + * called. + */ +predicate basicStoreStep(Node nodeFrom, LocalSourceNode nodeTo, string content) { + // TODO: support SetterMethodCall inside TuplePattern + exists(AST::Assignment assignment, AST::SetterMethodCall call, DataFlowPublic::ExprNode receiver | + assignment.getLeftOperand() = call and + content = getSetterCallAttributeName(call) and + receiver.getExprNode().getNode() = call.getReceiver() and + assignment.getRightOperand() = nodeFrom.(DataFlowPublic::ExprNode).getExprNode().getNode() and + nodeTo.flowsTo(receiver) + ) +} + +/** + * Returns the name of the attribute being set by the setter method call, i.e. + * the name of the setter method without the trailing `=`. In the following + * example, the result is `"bar"`. + * + * ```rb + * foo.bar = 1 + * ``` + */ +private string getSetterCallAttributeName(AST::SetterMethodCall call) { + // TODO: this should be exposed in `SetterMethodCall` + not call instanceof AST::ElementReference and + exists(string setterName | + setterName = call.getMethodName() and result = setterName.prefix(setterName.length() - 1) + ) +} + +/** + * Holds if `nodeTo` is the result of accessing the `content` content of `nodeFrom`. + */ +predicate basicLoadStep(Node nodeFrom, Node nodeTo, string content) { + exists(AST::MethodCall call | + call.getNumberOfArguments() = 0 and + content = call.getMethodName() and + nodeFrom.asExpr().getNode() = call.getReceiver() and + nodeTo.asExpr().getNode() = call + ) +} + +/** + * A utility class that is equivalent to `boolean` but does not require type joining. + */ +class Boolean extends boolean { + Boolean() { this = true or this = false } +}