Merge pull request #4752 from yoff/python-dataflow-unpacking-assignment

Python: Dataflow, unpacking assignment
2025-12-20 10:46:30 +01:00 · 2021-01-29 14:15:28 +01:00
parent be5b7bb4c4 0e0b18c214
commit cb195a0dc4
4 changed files with 1018 additions and 103 deletions
--- a/python/ql/src/semmle/python/dataflow/new/internal/DataFlowPrivate.qll
+++ b/python/ql/src/semmle/python/dataflow/new/internal/DataFlowPrivate.qll
@@ -161,6 +161,15 @@ module EssaFlow {
    nodeFrom.(CfgNode).getNode() =
      nodeTo.(EssaNode).getVar().getDefinition().(AssignmentDefinition).getValue()
    or
+    // Definition
+    //   `[a, b] = iterable`
+    //   nodeFrom = `iterable`, cfg node
+    //   nodeTo = `TIterableSequence([a, b])`
+    exists(UnpackingAssignmentDirectTarget target |
+      nodeFrom.asExpr() = target.getValue() and
+      nodeTo = TIterableSequenceNode(target)
+    )
+    or
    // With definition
    //   `with f(42) as x:`
    //   nodeFrom is `f(42)`, cfg node
@@ -174,6 +183,10 @@ module EssaFlow {
      contextManager.strictlyDominates(var)
    )
    or
+    // Parameter definition
+    //   `def foo(x):`
+    //   nodeFrom is `x`, cfgNode
+    //   nodeTo is `x`, essa var
    exists(ParameterDefinition pd |
      nodeFrom.asCfgNode() = pd.getDefiningNode() and
      nodeTo.asVar() = pd.getVariable()
@@ -196,6 +209,9 @@ module EssaFlow {
    // If expressions
    nodeFrom.asCfgNode() = nodeTo.asCfgNode().(IfExprNode).getAnOperand()
    or
+    // Flow inside an unpacking assignment
+    unpackingAssignmentFlowStep(nodeFrom, nodeTo)
+    or
    // Overflow keyword argument
    exists(CallNode call, CallableValue callable |
      call = callable.getACall() and
@@ -454,7 +470,7 @@ module ArgumentPassing {
      // argument unpacked from dict
      exists(string name |
        call_unpacks(call, mapping, callable, name, paramN) and
-        result = TKwUnpacked(call, callable, name)
+        result = TKwUnpackedNode(call, callable, name)
      )
    )
  }
@@ -891,6 +907,8 @@ predicate storeStep(Node nodeFrom, Content c, Node nodeTo) {
  or
  comprehensionStoreStep(nodeFrom, c, nodeTo)
  or
+  unpackingAssignmentStoreStep(nodeFrom, c, nodeTo)
+  or
  attributeStoreStep(nodeFrom, c, nodeTo)
  or
  posOverflowStoreStep(nodeFrom, c, nodeTo)
@@ -906,6 +924,7 @@ predicate listStoreStep(CfgNode nodeFrom, ListElementContent c, CfgNode nodeTo)
  //   nodeTo is the list, `[..., 42, ...]`, cfg node
  //   c denotes element of list
  nodeTo.getNode().(ListNode).getAnElement() = nodeFrom.getNode() and
+  not nodeTo.getNode() instanceof UnpackingAssignmentSequenceTarget and
  // Suppress unused variable warning
  c = c
 }
@@ -931,6 +950,7 @@ predicate tupleStoreStep(CfgNode nodeFrom, TupleElementContent c, CfgNode nodeTo
  //   c denotes element of tuple and index of nodeFrom
  exists(int n |
    nodeTo.getNode().(TupleNode).getElement(n) = nodeFrom.getNode() and
+    not nodeTo.getNode() instanceof UnpackingAssignmentSequenceTarget and
    c.getIndex() = n
  )
 }
@@ -1021,6 +1041,8 @@ predicate kwOverflowStoreStep(CfgNode nodeFrom, DictionaryElementContent c, Node
 predicate readStep(Node nodeFrom, Content c, Node nodeTo) {
  subscriptReadStep(nodeFrom, c, nodeTo)
  or
+  unpackingAssignmentReadStep(nodeFrom, c, nodeTo)
+  or
  popReadStep(nodeFrom, c, nodeTo)
  or
  comprehensionReadStep(nodeFrom, c, nodeTo)
@@ -1053,6 +1075,322 @@ predicate subscriptReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
  )
 }

+/**
+ * The unpacking assignment takes the general form
+ * ```python
+ *   sequence = iterable
+ * ```
+ * where `sequence` is either a tuple or a list and it can contain wildcards.
+ * The iterable can be any iterable, which means that (CodeQL modeling of) content
+ * will need to change type if it should be transferred from the LHS to the RHS.
+ *
+ * Note that (CodeQL modeling of) content does not have to change type on data-flow
+ * paths _inside_ the LHS, as the different allowed syntaxes here are merely a convenience.
+ * Consequently, we model all LHS sequences as tuples, which have the more precise content
+ * model, making flow to the elements more precise. If an element is a starred variable,
+ * we will have to mutate the content type to be list content.
+ *
+ * We may for instance have
+ * ```python
+ *    (a, b) = ["a", SOURCE]  # RHS has content `ListElementContent`
+ * ```
+ * Due to the abstraction for list content, we do not know whether `SOURCE`
+ * ends up in `a` or in `b`, so we want to overapproximate and see it in both.
+ *
+ * Using wildcards we may have
+ * ```python
+ *   (a, *b) = ("a", "b", SOURCE)  # RHS has content `TupleElementContent(2)`
+ * ```
+ * Since the starred variables are always assigned (Python-)type list, `*b` will be
+ * `["b", SOURCE]`, and we will again overapproximate and assign it
+ * content corresponding to anything found in the RHS.
+ *
+ * For a precise transfer
+ * ```python
+ *    (a, b) = ("a", SOURCE)  # RHS has content `TupleElementContent(1)`
+ * ```
+ * we wish to keep the precision, so only `b` receives the tuple content at index 1.
+ *
+ * Finally, `sequence` is actually a pattern and can have a more complicated structure,
+ * such as
+ * ```python
+ *   (a, [b, *c]) = ("a", ["b", SOURCE])  # RHS has content `TupleElementContent(1); ListElementContent`
+ * ```
+ * where `a` should not receive content, but `b` and `c` should. `c` will be `[SOURCE]` so
+ * should have the content transferred, while `b` should read it.
+ *
+ * To transfer content from RHS to the elements of the LHS in the expression `sequence = iterable`,
+ * we use two synthetic nodes:
+ *
+ * - `TIterableSequence(sequence)` which captures the content-modeling the entire `sequence` will have
+ * (essentially just a copy of the content-modeling the RHS has)
+ *
+ * - `TIterableElement(sequence)` which captures the content-modeling that will be assigned to an element.
+ * Note that an empty access path means that the value we are tracking flows directly to the element.
+ *
+ *
+ * The `TIterableSequence(sequence)` is at this point superfluous but becomes useful when handling recursive
+ * structures in the LHS, where `sequence` is some internal sequence node. We can have a uniform treatment
+ * by always having these two synthetic nodes. So we transfer to (or, in the recursive case, read into)
+ * `TIterableSequence(sequence)`, from which we take a read step to `TIterableElement(sequence)` and then a
+ * store step to `sequence`.
+ *
+ * This allows the unknown content from the RHS to be read into `TIterableElement(sequence)` and tuple content
+ * to then be stored into `sequence`. If the content is already tuple content, this indirection creates crosstalk
+ * between indices. Therefore, tuple content is never read into `TIterableElement(sequence)`; it is instead
+ * transferred directly from `TIterableSequence(sequence)` to `sequence` via a flow step. Such a flow step will
+ * also transfer other content, but only tuple content is further read from `sequence` into its elements.
+ *
+ * The strategy is then via several read-, store-, and flow steps:
+ * 1. [Flow] Content is transferred from `iterable` to `TIterableSequence(sequence)` via a
+ *    flow step. From here, everything happens on the LHS.
+ *
+ * 2. [Flow] Content is transferred from `TIterableSequence(sequence)` to `sequence` via a
+ *    flow step. (Here only tuple content is relevant.)
+ *
+ * 3. [Read] Content is read from `TIterableSequence(sequence)` into `TIterableElement(sequence)`.
+ *    As `sequence` is modeled as a tuple, we will not read tuple content as that would allow
+ *    crosstalk.
+ *
+ * 4. [Store] Content is stored from `TIterableElement(sequence)` to `sequence`.
+ *    Content type is `TupleElementContent` with indices taken from the syntax.
+ *    For instance, if `sequence` is `(a, *b, c)`, content is written to index 0, 1, and 2.
+ *    This is adequate as the route through `TIterableElement(sequence)` does not transfer precise content.
+ *
+ * 5. [Read] Content is read from `sequence` to its elements.
+ *    a) If the element is a plain variable, the target is the corresponding essa node.
+ *
+ *    b) If the element is itself a sequence, with control-flow node `seq`, the target is `TIterableSequence(seq)`.
+ *
+ *    c) If the element is a starred variable, with control-flow node `v`, the target is `TIterableElement(v)`.
+ *
+ * 6. [Store] Content is stored from `TIterableElement(v)` to the essa variable for `v`, with
+ *    content type `ListElementContent`.
+ *
+ * 7. [Flow, Read, Store] Steps 2 through 6 are repeated for all recursive elements which are sequences.
+ *
+ *
+ * We illustrate the above steps on the assignment
+ *
+ * ```python
+ * (a, b) = ["a", SOURCE]
+ * ```
+ *
+ * Looking at the content propagation to `a`:
+ *   `["a", SOURCE]`: [ListElementContent]
+ *
+ * --Step 1-->
+ *
+ *   `TIterableSequence((a, b))`: [ListElementContent]
+ *
+ * --Step 3-->
+ *
+ *   `TIterableElement((a, b))`: []
+ *
+ * --Step 4-->
+ *
+ *   `(a, b)`: [TupleElementContent(0)]
+ *
+ * --Step 5a-->
+ *
+ *   `a`: []
+ *
+ * Meaning there is data-flow from the RHS to `a` (an overapproximation). The same logic would be applied to show there is data-flow to `b`. Note that _Step 3_ and _Step 4_ would not have been needed if the RHS had been a tuple (since that would have been able to use _Step 2_ instead).
+ *
+ * Another, more complicated example:
+ * ```python
+ *   (a, [b, *c]) = ["a", [SOURCE]]
+ * ```
+ * where the path to `c` is
+ *
+ *   `["a", [SOURCE]]`: [ListElementContent; ListElementContent]
+ *
+ * --Step 1-->
+ *
+ *   `TIterableSequence((a, [b, *c]))`: [ListElementContent; ListElementContent]
+ *
+ * --Step 3-->
+ *
+ *   `TIterableElement((a, [b, *c]))`: [ListElementContent]
+ *
+ * --Step 4-->
+ *
+ *   `(a, [b, *c])`: [TupleElementContent(1); ListElementContent]
+ *
+ * --Step 5b-->
+ *
+ *   `TIterableSequence([b, *c])`: [ListElementContent]
+ *
+ * --Step 3-->
+ *
+ *   `TIterableElement([b, *c])`: []
+ *
+ * --Step 4-->
+ *
+ *   `[b, *c]`: [TupleElementContent(1)]
+ *
+ * --Step 5c-->
+ *
+ *   `TIterableElement(c)`: []
+ *
+ * --Step 6-->
+ *
+ *  `c`: [ListElementContent]
+ */
+module UnpackingAssignment {
+  /** A direct (or top-level) target of an unpacking assignment. */
+  class UnpackingAssignmentDirectTarget extends ControlFlowNode {
+    Expr value;
+
+    UnpackingAssignmentDirectTarget() {
+      this instanceof SequenceNode and
+      exists(Assign assign | this.getNode() = assign.getATarget() | value = assign.getValue())
+    }
+
+    Expr getValue() { result = value }
+  }
+
+  /** A (possibly recursive) target of an unpacking assignment. */
+  class UnpackingAssignmentTarget extends ControlFlowNode {
+    UnpackingAssignmentTarget() {
+      this instanceof UnpackingAssignmentDirectTarget
+      or
+      this = any(UnpackingAssignmentSequenceTarget parent).getAnElement()
+    }
+  }
+
+  /** A (possibly recursive) target of an unpacking assignment which is also a sequence. */
+  class UnpackingAssignmentSequenceTarget extends UnpackingAssignmentTarget {
+    UnpackingAssignmentSequenceTarget() { this instanceof SequenceNode }
+
+    ControlFlowNode getElement(int i) { result = this.(SequenceNode).getElement(i) }
+
+    ControlFlowNode getAnElement() { result = this.getElement(_) }
+  }
+
+  /**
+   * Step 2
+   * Data flows from `TIterableSequence(sequence)` to `sequence`
+   */
+  predicate unpackingAssignmentFlowStep(Node nodeFrom, Node nodeTo) {
+    exists(UnpackingAssignmentSequenceTarget target |
+      nodeFrom = TIterableSequenceNode(target) and
+      nodeTo.asCfgNode() = target
+    )
+  }
+
+  /**
+   * Step 3
+   * Data flows from `TIterableSequence(sequence)` into `TIterableElement(sequence)`.
+   * As `sequence` is modeled as a tuple, we will not read tuple content as that would allow
+   * crosstalk.
+   */
+  predicate unpackingAssignmentConvertingReadStep(Node nodeFrom, Content c, Node nodeTo) {
+    exists(UnpackingAssignmentSequenceTarget target |
+      nodeFrom = TIterableSequenceNode(target) and
+      nodeTo = TIterableElementNode(target) and
+      (
+        c instanceof ListElementContent
+        or
+        c instanceof SetElementContent
+        // TODO: dict content in iterable unpacking not handled
+      )
+    )
+  }
+
+  /**
+   * Step 4
+   * Data flows from `TIterableElement(sequence)` to `sequence`.
+   * Content type is `TupleElementContent` with indices taken from the syntax.
+   * For instance, if `sequence` is `(a, *b, c)`, content is written to index 0, 1, and 2.
+   */
+  predicate unpackingAssignmentConvertingStoreStep(Node nodeFrom, Content c, Node nodeTo) {
+    exists(UnpackingAssignmentSequenceTarget target |
+      nodeFrom = TIterableElementNode(target) and
+      nodeTo.asCfgNode() = target and
+      exists(int index | exists(target.getElement(index)) |
+        c.(TupleElementContent).getIndex() = index
+      )
+    )
+  }
+
+  /**
+   * Step 5
+   * For a sequence node inside an iterable unpacking, data flows from the sequence to its elements. There are
+   * three cases for what `nodeTo` should be:
+   *    a) If the element is a plain variable, `nodeTo` is the corresponding essa node.
+   *
+   *    b) If the element is itself a sequence, with control-flow node `seq`, `nodeTo` is `TIterableSequence(seq)`.
+   *
+   *    c) If the element is a starred variable, with control-flow node `v`, `nodeTo` is `TIterableElement(v)`.
+   */
+  predicate unpackingAssignmentElementReadStep(Node nodeFrom, Content c, Node nodeTo) {
+    exists(
+      UnpackingAssignmentSequenceTarget target, int index, ControlFlowNode element, int starIndex
+    |
+      target.getElement(starIndex) instanceof StarredNode
+      or
+      not exists(target.getAnElement().(StarredNode)) and
+      starIndex = -1
+    |
+      nodeFrom.asCfgNode() = target and
+      element = target.getElement(index) and
+      (
+        if starIndex = -1 or index < starIndex
+        then c.(TupleElementContent).getIndex() = index
+        else
+          // This could get big if big tuples exist
+          if index = starIndex
+          then c.(TupleElementContent).getIndex() >= index
+          else c.(TupleElementContent).getIndex() >= index - 1
+      ) and
+      (
+        if element instanceof SequenceNode
+        then
+          // Step 5b
+          nodeTo = TIterableSequenceNode(element)
+        else
+          if element instanceof StarredNode
+          then
+            // Step 5c
+            nodeTo = TIterableElementNode(element)
+          else
+            // Step 5a
+            nodeTo.asVar().getDefinition().(MultiAssignmentDefinition).getDefiningNode() = element
+      )
+    )
+  }
+
+  /**
+   * Step 6
+   * Data flows from `TIterableElement(v)` to the essa variable for `v`, with
+   * content type `ListElementContent`.
+   */
+  predicate unpackingAssignmentStarredElementStoreStep(Node nodeFrom, Content c, Node nodeTo) {
+    exists(ControlFlowNode starred | starred.getNode() instanceof Starred |
+      nodeFrom = TIterableElementNode(starred) and
+      nodeTo.asVar().getDefinition().(MultiAssignmentDefinition).getDefiningNode() = starred and
+      c instanceof ListElementContent
+    )
+  }
+
+  /** All read steps associated with unpacking assignment. */
+  predicate unpackingAssignmentReadStep(Node nodeFrom, Content c, Node nodeTo) {
+    unpackingAssignmentElementReadStep(nodeFrom, c, nodeTo)
+    or
+    unpackingAssignmentConvertingReadStep(nodeFrom, c, nodeTo)
+  }
+
+  /** All store steps associated with unpacking assignment. */
+  predicate unpackingAssignmentStoreStep(Node nodeFrom, Content c, Node nodeTo) {
+    unpackingAssignmentStarredElementStoreStep(nodeFrom, c, nodeTo)
+    or
+    unpackingAssignmentConvertingStoreStep(nodeFrom, c, nodeTo)
+  }
+}
+
+import UnpackingAssignment
+
 /** Data flows from a sequence to a call to `pop` on the sequence. */
 predicate popReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
  // set.pop or list.pop
@@ -1139,7 +1477,7 @@ predicate attributeReadStep(CfgNode nodeFrom, AttributeContent c, CfgNode nodeTo
 predicate kwUnpackReadStep(CfgNode nodeFrom, DictionaryElementContent c, Node nodeTo) {
  exists(CallNode call, CallableValue callable, string name |
    nodeFrom.asCfgNode() = call.getNode().getKwargs().getAFlowNode() and
-    nodeTo = TKwUnpacked(call, callable, name) and
+    nodeTo = TKwUnpackedNode(call, callable, name) and
    name = c.getKey()
  )
 }
--- a/python/ql/src/semmle/python/dataflow/new/internal/DataFlowPublic.qll
+++ b/python/ql/src/semmle/python/dataflow/new/internal/DataFlowPublic.qll
@@ -58,9 +58,18 @@ newtype TNode =
   * That is, `call` contains argument `**{"foo": bar}` which is passed
   * to parameter `foo` of `callable`.
   */
-  TKwUnpacked(CallNode call, CallableValue callable, string name) {
+  TKwUnpackedNode(CallNode call, CallableValue callable, string name) {
    call_unpacks(call, _, callable, name, _)
-  }
+  } or
+  /**
+   * A synthetic node representing that an iterable sequence flows to `consumer`.
+   */
+  TIterableSequenceNode(UnpackingAssignmentSequenceTarget consumer) or
+  /**
+   * A synthetic node representing that there may be an iterable element
+   * for `consumer` to consume.
+   */
+  TIterableElementNode(UnpackingAssignmentTarget consumer)

 /** Helper for `Node::getEnclosingCallable`. */
 private DataFlowCallable getCallableScope(Scope s) {
@@ -338,11 +347,11 @@ class KwOverflowNode extends Node, TKwOverflowNode {
 * The node representing the synthetic argument of a call that is unpacked from a dictionary
 * argument.
 */
-class KwUnpacked extends Node, TKwUnpacked {
+class KwUnpackedNode extends Node, TKwUnpackedNode {
  CallNode call;
  string name;

-  KwUnpacked() { this = TKwUnpacked(call, _, name) }
+  KwUnpackedNode() { this = TKwUnpackedNode(call, _, name) }

  override string toString() { result = "KwUnpacked " + name }

@@ -356,6 +365,42 @@ class KwUnpacked extends Node, TKwUnpacked {
  override Location getLocation() { result = call.getLocation() }
 }

+/**
+ * A synthetic node representing an iterable sequence. Used for changing content type
+ * for instance from a `ListElement` to a `TupleElement`, especially if the content is
+ * transferred via a read step which cannot be broken up into a read and a store. The
+ * read step then targets TIterableSequence, and the conversion can happen via a read
+ * step to TIterableElement followed by a store step to the target.
+ */
+class IterableSequenceNode extends Node, TIterableSequenceNode {
+  CfgNode consumer;
+
+  IterableSequenceNode() { this = TIterableSequenceNode(consumer.getNode()) }
+
+  override string toString() { result = "IterableSequence" }
+
+  override DataFlowCallable getEnclosingCallable() { result = consumer.getEnclosingCallable() }
+
+  override Location getLocation() { result = consumer.getLocation() }
+}
+
+/**
+ * A synthetic node representing an iterable element. Used for changing content type
+ * for instance from a `ListElement` to a `TupleElement`. This would happen via a
+ * read step from the list to IterableElement followed by a store step to the tuple.
+ */
+class IterableElementNode extends Node, TIterableElementNode {
+  CfgNode consumer;
+
+  IterableElementNode() { this = TIterableElementNode(consumer.getNode()) }
+
+  override string toString() { result = "IterableElement" }
+
+  override DataFlowCallable getEnclosingCallable() { result = consumer.getEnclosingCallable() }
+
+  override Location getLocation() { result = consumer.getLocation() }
+}
+
 /**
 * A node that controls whether other nodes are evaluated.
 */