From c1c437f0206ec7bcbbca2cc807a2aa149df883de Mon Sep 17 00:00:00 2001
From: Nick Rolfe <nickrolfe@github.com>
Date: Fri, 16 Apr 2021 16:54:30 +0100
Subject: [PATCH] Minimal implementation of shared type-tracking library

---
 .../dataflow/internal/DataFlowDispatch.qll    |   6 +
 .../dataflow/internal/DataFlowPublic.qll      |  32 ++
 .../codeql_ruby/typetracking/TypeTracker.qll  | 420 ++++++++++++++++++
 .../typetracking/TypeTrackerPrivate.qll       | 119 +++++
 4 files changed, 577 insertions(+)
 create mode 100644 ql/src/codeql_ruby/typetracking/TypeTracker.qll
 create mode 100644 ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll

diff --git a/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll b/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll
index b412793ffaa..a539e18b069 100644
--- a/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll
+++ b/ql/src/codeql_ruby/dataflow/internal/DataFlowDispatch.qll
@@ -40,6 +40,12 @@ class DataFlowCallable = CfgScope;
 
 class DataFlowCall extends CfgNodes::ExprNodes::CallCfgNode {
   DataFlowCallable getEnclosingCallable() { result = this.getScope() }
+
+  /** Gets the target of this call. TODO: placeholder; matches by method name, only when exactly one callable matches. */
+  DataFlowCallable getCallable() {
+    result =
+      unique(DataFlowCallable target | target.(Method).getName() = this.getNode().(MethodCall).getMethodName())
+  }
 }
 
 /** Gets a viable run-time target for the call `call`. */
diff --git a/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll b/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll
index 7a8d40a5e1a..0a8716a0b07 100644
--- a/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll
+++ b/ql/src/codeql_ruby/dataflow/internal/DataFlowPublic.qll
@@ -2,6 +2,7 @@ private import ruby
 private import DataFlowDispatch
 private import DataFlowPrivate
 private import codeql_ruby.CFG
+private import codeql_ruby.typetracking.TypeTracker
 
 /**
  * An element, viewed as a node in a data flow graph. Either an expression
@@ -36,6 +37,14 @@ class Node extends TNode {
   ) {
     getLocation().hasLocationInfo(filepath, startline, startcolumn, endline, endcolumn)
   }
+
+  /**
+   * Gets a node that this node may flow to using one heap and/or interprocedural step, where
+   * `t2` is the tracker at this node and `t` is the tracker after appending that step.
+   * See `TypeTracker` for more details about how to use this.
+   */
+  pragma[inline]
+  Node track(TypeTracker t2, TypeTracker t) { t2.step(this, result) = t }
 }
 
 /**
@@ -73,6 +82,29 @@ class ParameterNode extends Node, TParameterNode {
   predicate isParameterOf(Callable c, int i) { p = c.getParameter(i) }
 }
 
+/**
+ * A data-flow node that no expression node reaches via local flow steps, i.e. a source of local flow.
+ */
+class LocalSourceNode extends Node {
+  LocalSourceNode() { not exists(ExprNode pred | simpleLocalFlowStep+(pred, this)) }
+
+  /** Holds if this `LocalSourceNode` can flow to `nodeTo` in zero or more local flow steps. */
+  pragma[inline]
+  predicate flowsTo(Node nodeTo) { hasLocalSource(nodeTo, this) }
+}
+
+/** Holds if `source` is a `LocalSourceNode` that reaches `sink` in zero or more local flow steps. */
+predicate hasLocalSource(Node sink, Node source) {
+  // Declaring `source` to be a `LocalSourceNode` currently causes a redundant check in the
+  // recursive case, so the base case tests it explicitly instead.
+  sink = source and sink instanceof LocalSourceNode
+  or
+  exists(Node prev |
+    simpleLocalFlowStep(prev, sink) and
+    hasLocalSource(prev, source)
+  )
+}
+
 /** Gets a node corresponding to expression `e`. */
 ExprNode exprNode(CfgNodes::ExprCfgNode e) { result.getExprNode() = e }
 
diff --git a/ql/src/codeql_ruby/typetracking/TypeTracker.qll b/ql/src/codeql_ruby/typetracking/TypeTracker.qll
new file mode 100644
index 00000000000..46461d3e22e
--- /dev/null
+++ b/ql/src/codeql_ruby/typetracking/TypeTracker.qll
@@ -0,0 +1,420 @@
+/** Step Summaries and Type Tracking */
+
+private import TypeTrackerPrivate
+
+/**
+ * Any string that may appear as the name of a piece of content. This will usually include things like:
+ * - Attribute names (in Python)
+ * - Property names (in JavaScript)
+ *
+ * In general, this can also be used to model things like stores to specific list indices. To ensure
+ * correctness, it is important that
+ *
+ * - different types of content do not have overlapping names, and
+ * - the empty string `""` is not a valid piece of content, as it is used to indicate the absence of
+ *   content instead.
+ */
+class ContentName extends string {
+  ContentName() { this = getPossibleContentName() }
+}
+
+/** Either a content name, or the empty string (representing no content). */
+class OptionalContentName extends string {
+  OptionalContentName() { this instanceof ContentName or this = "" }
+}
+
+/**
+ * A description of a step on an inter-procedural data flow path.
+ */
+private newtype TStepSummary =
+  LevelStep() or
+  CallStep() or
+  ReturnStep() or
+  StoreStep(ContentName content) or
+  LoadStep(ContentName content)
+
+/**
+ * INTERNAL: Use `TypeTracker` or `TypeBackTracker` instead.
+ *
+ * A description of a step on an inter-procedural data flow path.
+ */
+class StepSummary extends TStepSummary {
+  /** Gets a textual representation of this step summary. */
+  string toString() {
+    this instanceof LevelStep and result = "level"
+    or
+    this instanceof CallStep and result = "call"
+    or
+    this instanceof ReturnStep and result = "return"
+    or
+    exists(string content | this = StoreStep(content) | result = "store " + content)
+    or
+    exists(string content | this = LoadStep(content) | result = "load " + content)
+  }
+}
+
+/** Provides predicates for updating step summaries (`StepSummary`s). */
+module StepSummary {
+  /**
+   * Gets the summary that corresponds to having taken a forwards
+   * heap and/or inter-procedural step from `nodeFrom` to `nodeTo`.
+   */
+  cached
+  predicate step(LocalSourceNode nodeFrom, LocalSourceNode nodeTo, StepSummary summary) {
+    exists(Node mid | nodeFrom.flowsTo(mid) and smallstep(mid, nodeTo, summary))
+  }
+
+  /**
+   * Gets the summary that corresponds to having taken a forwards
+   * local, heap and/or inter-procedural step from `nodeFrom` to `nodeTo`.
+   *
+   * Unlike `StepSummary::step`, this predicate does not compress
+   * type-preserving steps.
+   */
+  predicate smallstep(Node nodeFrom, Node nodeTo, StepSummary summary) {
+    typePreservingStep(nodeFrom, nodeTo) and
+    summary = LevelStep()
+    or
+    callStep(nodeFrom, nodeTo) and summary = CallStep()
+    or
+    returnStep(nodeFrom, nodeTo) and
+    summary = ReturnStep()
+    or
+    exists(string content |
+      localSourceStoreStep(nodeFrom, nodeTo, content) and
+      summary = StoreStep(content)
+      or
+      basicLoadStep(nodeFrom, nodeTo, content) and summary = LoadStep(content)
+    )
+  }
+
+  /**
+   * Holds if `nodeFrom` is being written to the `content` content of the object in `nodeTo`.
+   *
+   * Note that `nodeTo` will always be a local source node that flows to the place where the content
+   * is written in `basicStoreStep`. This may lead to the flow of information going "back in time"
+   * from the point of view of the execution of the program.
+   *
+   * For instance, if we interpret attribute writes in Python as writing to content with the same
+   * name as the attribute and consider the following snippet
+   *
+   * ```python
+   * def foo(y):
+   *    x = Foo()
+   *    bar(x)
+   *    x.attr = y
+   *    baz(x)
+   *
+   * def bar(x):
+   *    z = x.attr
+   * ```
+   * for the attribute write `x.attr = y`, we will have `content` being the literal string `"attr"`,
+   * `nodeFrom` will be `y`, and `nodeTo` will be the object `Foo()` created on the first line of the
+   * function. This means we will track the fact that `x.attr` can have the type of `y` into the
+   * assignment to `z` inside `bar`, even though this attribute write happens _after_ `bar` is called.
+   */
+  predicate localSourceStoreStep(Node nodeFrom, LocalSourceNode nodeTo, string content) {
+    exists(Node obj | nodeTo.flowsTo(obj) and basicStoreStep(nodeFrom, obj, content))
+  }
+}
+
+private newtype TTypeTracker = MkTypeTracker(Boolean hasCall, OptionalContentName content)
+
+/**
+ * Summary of the steps needed to track a value to a given dataflow node.
+ *
+ * This can be used to track objects that implement a certain API in order to
+ * recognize calls to that API. Note that type-tracking does not by itself provide a
+ * source/sink relation, that is, it may determine that a node has a given type,
+ * but it won't determine where that type came from.
+ *
+ * It is recommended that all uses of this type are written in the following form,
+ * for tracking some type `myType`:
+ * ```ql
+ * DataFlow::LocalSourceNode myType(DataFlow::TypeTracker t) {
+ *   t.start() and
+ *   result = < source of myType >
+ *   or
+ *   exists (DataFlow::TypeTracker t2 |
+ *     result = myType(t2).track(t2, t)
+ *   )
+ * }
+ *
+ * DataFlow::Node myType() { myType(DataFlow::TypeTracker::end()).flowsTo(result) }
+ * ```
+ *
+ * Instead of `result = myType(t2).track(t2, t)`, you can also use the equivalent
+ * `t = t2.step(myType(t2), result)`. If you additionally want to track individual
+ * intra-procedural steps, use `t = t2.smallstep(myType(t2), result)`.
+ */
+class TypeTracker extends TTypeTracker {
+  Boolean hasCall;
+  OptionalContentName content;
+
+  TypeTracker() { this = MkTypeTracker(hasCall, content) }
+
+  /** Gets the summary resulting from appending `step` to this type-tracking summary. */
+  cached
+  TypeTracker append(StepSummary step) {
+    step = LevelStep() and result = this
+    or
+    step = CallStep() and result = MkTypeTracker(true, content)
+    or
+    step = ReturnStep() and hasCall = false and result = this
+    or
+    step = LoadStep(content) and result = MkTypeTracker(hasCall, "")
+    or
+    exists(string p | step = StoreStep(p) and content = "" and result = MkTypeTracker(hasCall, p))
+  }
+
+  /** Gets a textual representation of this summary. */
+  string toString() {
+    exists(string withCall, string withContent |
+      (if hasCall = true then withCall = "with" else withCall = "without") and
+      (if content != "" then withContent = " with content " + content else withContent = "") and
+      result = "type tracker " + withCall + " call steps" + withContent
+    )
+  }
+
+  /**
+   * Holds if this is the starting point of type tracking.
+   */
+  predicate start() { hasCall = false and content = "" }
+
+  /**
+   * Holds if this is the starting point of type tracking, and the value starts in the content named `contentName`.
+   * The type tracking only ends after the content has been loaded.
+   */
+  predicate startInContent(ContentName contentName) { hasCall = false and content = contentName }
+
+  /**
+   * Holds if this is the starting point of type tracking
+   * when tracking a parameter into a call, but not out of it.
+   */
+  predicate call() { hasCall = true and content = "" }
+
+  /**
+   * Holds if this is the end point of type tracking.
+   */
+  predicate end() { content = "" }
+
+  /**
+   * INTERNAL. DO NOT USE.
+   *
+   * Holds if this type has been tracked into a call.
+   */
+  boolean hasCall() { result = hasCall }
+
+  /**
+   * INTERNAL. DO NOT USE.
+   *
+   * Gets the content associated with this type tracker.
+   */
+  string getContent() { result = content }
+
+  /**
+   * Gets a type tracker that starts where this one has left off to allow continued
+   * tracking.
+   *
+   * This predicate is only defined if the type is not associated to a piece of content.
+   */
+  TypeTracker continue() { content = "" and result = this }
+
+  /**
+   * Gets the summary that corresponds to having taken a forwards
+   * heap and/or inter-procedural step from `nodeFrom` to `nodeTo`.
+   */
+  pragma[inline]
+  TypeTracker step(LocalSourceNode nodeFrom, Node nodeTo) {
+    exists(StepSummary summary |
+      StepSummary::step(nodeFrom, nodeTo, summary) and
+      result = this.append(summary)
+    )
+  }
+
+  /**
+   * Gets the summary that corresponds to having taken a forwards
+   * local, heap and/or inter-procedural step from `nodeFrom` to `nodeTo`.
+   *
+   * Unlike `TypeTracker::step`, this predicate exposes all edges
+   * in the flow graph, and not just the edges between
+   * `LocalSourceNode`s. It may therefore be less performant.
+   *
+   * Type tracking predicates using small steps typically take the following form:
+   * ```ql
+   * DataFlow::Node myType(DataFlow::TypeTracker t) {
+   *   t.start() and
+   *   result = < source of myType >
+   *   or
+   *   exists (DataFlow::TypeTracker t2 |
+   *     t = t2.smallstep(myType(t2), result)
+   *   )
+   * }
+   *
+   * DataFlow::Node myType() {
+   *   result = myType(DataFlow::TypeTracker::end())
+   * }
+   * ```
+   */
+  pragma[inline]
+  TypeTracker smallstep(Node nodeFrom, Node nodeTo) {
+    exists(StepSummary summary |
+      StepSummary::smallstep(nodeFrom, nodeTo, summary) and
+      result = this.append(summary)
+    )
+    or
+    typePreservingStep(nodeFrom, nodeTo) and
+    result = this
+  }
+}
+
+/** Provides predicates for implementing custom `TypeTracker`s. */
+module TypeTracker {
+  /**
+   * Gets a valid end point of type tracking.
+   */
+  TypeTracker end() { result.end() }
+}
+
+private newtype TTypeBackTracker = MkTypeBackTracker(Boolean hasReturn, OptionalContentName content)
+
+/**
+ * Summary of the steps needed to back-track a use of a value to a given dataflow node.
+ *
+ * This can for example be used to track callbacks that are passed to a certain API,
+ * so we can model specific parameters of that callback as having a certain type.
+ *
+ * Note that type back-tracking does not provide a source/sink relation, that is,
+ * it may determine that a node will be used in an API call somewhere, but it won't
+ * determine exactly where that use was, or the path that led to the use.
+ *
+ * It is recommended that all uses of this type are written in the following form,
+ * for back-tracking some callback type `myCallback`:
+ *
+ * ```ql
+ * DataFlow::LocalSourceNode myCallback(DataFlow::TypeBackTracker t) {
+ *   t.start() and
+ *   result = (< some API call >).getArgument(< n >).getALocalSource()
+ *   or
+ *   exists (DataFlow::TypeBackTracker t2 |
+ *     result = myCallback(t2).backtrack(t2, t)
+ *   )
+ * }
+ *
+ * DataFlow::LocalSourceNode myCallback() { result = myCallback(DataFlow::TypeBackTracker::end()) }
+ * ```
+ *
+ * Instead of `result = myCallback(t2).backtrack(t2, t)`, you can also use the equivalent
+ * `t2 = t.step(result, myCallback(t2))`. If you additionally want to track individual
+ * intra-procedural steps, use `t2 = t.smallstep(result, myCallback(t2))`.
+ */
+class TypeBackTracker extends TTypeBackTracker {
+  Boolean hasReturn;
+  string content;
+
+  TypeBackTracker() { this = MkTypeBackTracker(hasReturn, content) }
+
+  /** Gets the summary resulting from prepending `step` to this type-tracking summary. */
+  TypeBackTracker prepend(StepSummary step) {
+    step = LevelStep() and result = this
+    or
+    step = CallStep() and hasReturn = false and result = this
+    or
+    step = ReturnStep() and result = MkTypeBackTracker(true, content)
+    or
+    exists(string p |
+      step = LoadStep(p) and content = "" and result = MkTypeBackTracker(hasReturn, p)
+    )
+    or
+    step = StoreStep(content) and result = MkTypeBackTracker(hasReturn, "")
+  }
+
+  /** Gets a textual representation of this summary. */
+  string toString() {
+    exists(string withReturn, string withContent |
+      (if hasReturn = true then withReturn = "with" else withReturn = "without") and
+      (if content != "" then withContent = " with content " + content else withContent = "") and
+      result = "type back-tracker " + withReturn + " return steps" + withContent
+    )
+  }
+
+  /**
+   * Holds if this is the starting point of type back-tracking.
+   */
+  predicate start() { hasReturn = false and content = "" }
+
+  /**
+   * Holds if this is the end point of type back-tracking.
+   */
+  predicate end() { content = "" }
+
+  /**
+   * INTERNAL. DO NOT USE.
+   *
+   * Holds if this type has been back-tracked into a call through return edge.
+   */
+  boolean hasReturn() { result = hasReturn }
+
+  /**
+   * Gets a type back-tracker that starts where this one has left off to allow continued
+   * back-tracking.
+   *
+   * This predicate is only defined if the type has not been tracked into a piece of content.
+   */
+  TypeBackTracker continue() { content = "" and result = this }
+
+  /**
+   * Gets the summary that corresponds to having taken a backwards
+   * heap and/or inter-procedural step from `nodeTo` to `nodeFrom`.
+   */
+  pragma[inline]
+  TypeBackTracker step(LocalSourceNode nodeFrom, LocalSourceNode nodeTo) {
+    exists(StepSummary summary |
+      StepSummary::step(nodeFrom, nodeTo, summary) and
+      this = result.prepend(summary)
+    )
+  }
+
+  /**
+   * Gets the summary that corresponds to having taken a backwards
+   * local, heap and/or inter-procedural step from `nodeTo` to `nodeFrom`.
+   *
+   * Unlike `TypeBackTracker::step`, this predicate exposes all edges
+   * in the flowgraph, and not just the edges between
+   * `LocalSourceNode`s. It may therefore be less performant.
+   *
+   * Type tracking predicates using small steps typically take the following form:
+   * ```ql
+   * DataFlow::Node myType(DataFlow::TypeBackTracker t) {
+   *   t.start() and
+   *   result = < some API call >.getArgument(< n >)
+   *   or
+   *   exists (DataFlow::TypeBackTracker t2 |
+   *     t = t2.smallstep(result, myType(t2))
+   *   )
+   * }
+   *
+   * DataFlow::Node myType() {
+   *   result = myType(DataFlow::TypeBackTracker::end())
+   * }
+   * ```
+   */
+  pragma[inline]
+  TypeBackTracker smallstep(Node nodeFrom, Node nodeTo) {
+    exists(StepSummary summary |
+      StepSummary::smallstep(nodeFrom, nodeTo, summary) and
+      this = result.prepend(summary)
+    )
+    or
+    typePreservingStep(nodeFrom, nodeTo) and
+    this = result
+  }
+}
+
+/** Provides predicates for implementing custom `TypeBackTracker`s. */
+module TypeBackTracker {
+  /**
+   * Gets a valid end point of type back-tracking.
+   */
+  TypeBackTracker end() { result.end() }
+}
diff --git a/ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll b/ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll
new file mode 100644
index 00000000000..447c60ffa93
--- /dev/null
+++ b/ql/src/codeql_ruby/typetracking/TypeTrackerPrivate.qll
@@ -0,0 +1,119 @@
+private import codeql_ruby.AST as AST
+private import codeql_ruby.dataflow.internal.DataFlowPublic as DataFlowPublic
+private import codeql_ruby.dataflow.internal.DataFlowPrivate as DataFlowPrivate
+private import codeql_ruby.dataflow.internal.DataFlowDispatch as DataFlowDispatch
+
+class Node = DataFlowPublic::Node;
+
+class LocalSourceNode = DataFlowPublic::LocalSourceNode;
+
+/** Holds if the step from `nodeFrom` to `nodeTo` is a local flow step or a jump step, which is expected to preserve types. */
+predicate typePreservingStep(Node nodeFrom, Node nodeTo) {
+  DataFlowPrivate::jumpStep(nodeFrom, nodeTo) or
+  DataFlowPrivate::simpleLocalFlowStep(nodeFrom, nodeTo)
+}
+
+/**
+ * Gets the name of a possible piece of content. For Ruby this is currently the attribute
+ * name of some setter method call, e.g. `bar` for `foo.bar = 1`.
+ */
+string getPossibleContentName() {
+  exists(AST::SetterMethodCall setter | result = getSetterCallAttributeName(setter))
+}
+
+/** Holds if `nodeFrom` is an argument of a call and `nodeTo` is the matching parameter of its callee. */
+predicate callStep(
+  DataFlowPrivate::ArgumentNode nodeFrom, DataFlowPrivate::ExplicitParameterNode nodeTo
+) {
+  // Arguments and parameters are matched by index; the callee comes from `DataFlowCall::getCallable`.
+  exists(DataFlowDispatch::DataFlowCall call, int pos |
+    nodeFrom.argumentOf(call, pos) and
+    nodeTo.isParameterOf(call.getCallable(), pos)
+  )
+}
+
+/** Holds if `nodeFrom` is a return node in the callee of some call and `nodeTo` is the node for that call's expression. */
+predicate returnStep(DataFlowPrivate::ReturnNode nodeFrom, Node nodeTo) {
+  exists(DataFlowDispatch::DataFlowCall site |
+    site.getNode() = nodeTo.asExpr().getNode() and
+    site.getCallable() = nodeFrom.getEnclosingCallable()
+  )
+}
+
+/**
+ * Holds if `nodeFrom` is being written to the `content` content of the object
+ * in `nodeTo`.
+ *
+ * Note that the choice of `nodeTo` does not have to make sense
+ * "chronologically". All we care about is whether the `content` content of
+ * `nodeTo` can have a specific type, and the assumption is that if a specific
+ * type appears here, then any access of that particular content can yield
+ * something of that particular type.
+ *
+ * Thus, in an example such as
+ *
+ * ```rb
+ * def foo(y)
+ *    x = Foo.new
+ *    bar(x)
+ *    x.content = y
+ *    baz(x)
+ * end
+ *
+ * def bar(x)
+ *    z = x.content
+ * end
+ * ```
+ * for the content write `x.content = y`, we will have `content` being the
+ * literal string `"content"`, `nodeFrom` will be `y`, and `nodeTo` will be the
+ * `Foo` object created on the first line of the function. This means we will
+ * track the fact that `x.content` can have the type of `y` into the assignment
+ * to `z` inside `bar`, even though this content write happens _after_ `bar` is
+ * called.
+ */
+predicate basicStoreStep(Node nodeFrom, LocalSourceNode nodeTo, string content) {
+  // TODO: support SetterMethodCall inside TuplePattern
+  exists(AST::Assignment assign, AST::SetterMethodCall setter, DataFlowPublic::ExprNode recv |
+    setter = assign.getLeftOperand() and
+    nodeFrom.(DataFlowPublic::ExprNode).getExprNode().getNode() = assign.getRightOperand() and
+    recv.getExprNode().getNode() = setter.getReceiver() and
+    nodeTo.flowsTo(recv) and
+    content = getSetterCallAttributeName(setter)
+  )
+}
+
+/**
+ * Gets the name of the attribute being set by the setter method call `call`, i.e.
+ * its method name with the last character (the trailing `=`) dropped. Element
+ * references are excluded. In the following example, the result is `"bar"`.
+ *
+ * ```rb
+ * foo.bar = 1
+ * ```
+ */
+private string getSetterCallAttributeName(AST::SetterMethodCall call) {
+  // TODO: this should be exposed in `SetterMethodCall`
+  exists(string name | name = call.getMethodName() |
+    result = name.prefix(name.length() - 1)
+  ) and
+  not call instanceof AST::ElementReference
+}
+
+/**
+ * Holds if `nodeTo` is a method call without arguments named `content` whose receiver is `nodeFrom`.
+ */
+predicate basicLoadStep(Node nodeFrom, Node nodeTo, string content) {
+  exists(AST::MethodCall read |
+    read = nodeTo.asExpr().getNode() and
+    read.getReceiver() = nodeFrom.asExpr().getNode() and
+    read.getMethodName() = content and
+    read.getNumberOfArguments() = 0
+  )
+}
+
+/**
+ * A stand-in for `boolean`, covering both of its values, that does not require type joining.
+ */
+class Boolean extends boolean {
+  Boolean() { this = [false, true] }
+}