Merge pull request #4752 from yoff/python-dataflow-unpacking-assignment

Python: Dataflow, unpacking assignment
This commit is contained in:
Taus
2021-01-29 14:15:28 +01:00
committed by GitHub
4 changed files with 1018 additions and 103 deletions

View File

@@ -161,6 +161,15 @@ module EssaFlow {
nodeFrom.(CfgNode).getNode() =
nodeTo.(EssaNode).getVar().getDefinition().(AssignmentDefinition).getValue()
or
// Definition
// `[a, b] = iterable`
// nodeFrom = `iterable`, cfg node
// nodeTo = `TIterableSequence([a, b])`
exists(UnpackingAssignmentDirectTarget target |
nodeFrom.asExpr() = target.getValue() and
nodeTo = TIterableSequenceNode(target)
)
or
// With definition
// `with f(42) as x:`
// nodeFrom is `f(42)`, cfg node
@@ -174,6 +183,10 @@ module EssaFlow {
contextManager.strictlyDominates(var)
)
or
// Parameter definition
// `def foo(x):`
// nodeFrom is `x`, cfgNode
// nodeTo is `x`, essa var
exists(ParameterDefinition pd |
nodeFrom.asCfgNode() = pd.getDefiningNode() and
nodeTo.asVar() = pd.getVariable()
@@ -196,6 +209,9 @@ module EssaFlow {
// If expressions
nodeFrom.asCfgNode() = nodeTo.asCfgNode().(IfExprNode).getAnOperand()
or
// Flow inside an unpacking assignment
unpackingAssignmentFlowStep(nodeFrom, nodeTo)
or
// Overflow keyword argument
exists(CallNode call, CallableValue callable |
call = callable.getACall() and
@@ -454,7 +470,7 @@ module ArgumentPassing {
// argument unpacked from dict
exists(string name |
call_unpacks(call, mapping, callable, name, paramN) and
result = TKwUnpacked(call, callable, name)
result = TKwUnpackedNode(call, callable, name)
)
)
}
@@ -891,6 +907,8 @@ predicate storeStep(Node nodeFrom, Content c, Node nodeTo) {
or
comprehensionStoreStep(nodeFrom, c, nodeTo)
or
unpackingAssignmentStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
posOverflowStoreStep(nodeFrom, c, nodeTo)
@@ -906,6 +924,7 @@ predicate listStoreStep(CfgNode nodeFrom, ListElementContent c, CfgNode nodeTo)
// nodeTo is the list, `[..., 42, ...]`, cfg node
// c denotes element of list
nodeTo.getNode().(ListNode).getAnElement() = nodeFrom.getNode() and
not nodeTo.getNode() instanceof UnpackingAssignmentSequenceTarget and
// Suppress unused variable warning
c = c
}
@@ -931,6 +950,7 @@ predicate tupleStoreStep(CfgNode nodeFrom, TupleElementContent c, CfgNode nodeTo
// c denotes element of tuple and index of nodeFrom
exists(int n |
nodeTo.getNode().(TupleNode).getElement(n) = nodeFrom.getNode() and
not nodeTo.getNode() instanceof UnpackingAssignmentSequenceTarget and
c.getIndex() = n
)
}
@@ -1021,6 +1041,8 @@ predicate kwOverflowStoreStep(CfgNode nodeFrom, DictionaryElementContent c, Node
predicate readStep(Node nodeFrom, Content c, Node nodeTo) {
subscriptReadStep(nodeFrom, c, nodeTo)
or
unpackingAssignmentReadStep(nodeFrom, c, nodeTo)
or
popReadStep(nodeFrom, c, nodeTo)
or
comprehensionReadStep(nodeFrom, c, nodeTo)
@@ -1053,6 +1075,322 @@ predicate subscriptReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
)
}
/**
* The unpacking assignment takes the general form
* ```python
* sequence = iterable
* ```
* where `sequence` is either a tuple or a list and it can contain wildcards.
* The iterable can be any iterable, which means that (CodeQL modeling of) content
* will need to change type if it should be transferred from the LHS to the RHS.
*
* Note that (CodeQL modeling of) content does not have to change type on data-flow
* paths _inside_ the LHS, as the different allowed syntaxes here are merely a convenience.
* Consequently, we model all LHS sequences as tuples, which have the more precise content
* model, making flow to the elements more precise. If an element is a starred variable,
* we will have to mutate the content type to be list content.
*
* We may for instance have
* ```python
* (a, b) = ["a", SOURCE] # RHS has content `ListElementContent`
* ```
* Due to the abstraction for list content, we do not know whether `SOURCE`
* ends up in `a` or in `b`, so we want to overapproximate and see it in both.
*
* Using wildcards we may have
* ```python
* (a, *b) = ("a", "b", SOURCE) # RHS has content `TupleElementContent(2)`
* ```
* Since the starred variables are always assigned (Python-)type list, `*b` will be
* `["b", SOURCE]`, and we will again overapproximate and assign it
* content corresponding to anything found in the RHS.
*
* For a precise transfer
* ```python
* (a, b) = ("a", SOURCE) # RHS has content `TupleElementContent(1)`
* ```
* we wish to keep the precision, so only `b` receives the tuple content at index 1.
*
* Finally, `sequence` is actually a pattern and can have a more complicated structure,
* such as
* ```python
* (a, [b, *c]) = ("a", ["b", SOURCE]) # RHS has content `TupleElementContent(1); ListElementContent`
* ```
* where `a` should not receive content, but `b` and `c` should. `c` will be `[SOURCE]` so
* should have the content transferred, while `b` should read it.
*
* To transfer content from RHS to the elements of the LHS in the expression `sequence = iterable`,
* we use two synthetic nodes:
*
* - `TIterableSequence(sequence)` which captures the content-modeling the entire `sequence` will have
* (essentially just a copy of the content-modeling the RHS has)
*
* - `TIterableElement(sequence)` which captures the content-modeling that will be assigned to an element.
* Note that an empty access path means that the value we are tracking flows directly to the element.
*
*
* The `TIterableSequence(sequence)` is at this point superflous but becomes useful when handling recursive
* structures in the LHS, where `sequence` is some internal sequence node. We can have a uniform treatment
* by always having these two synthetic nodes. So we transfer to (or, in the recursive case, read into)
* `TIterableSequence(sequence)`, from which we take a read step to `TIterableElement(sequence)` and then a
* store step to `sequence`.
*
* This allows the unknown content from the RHS to be read into `TIterableElement(sequence)` and tuple content
* to then be stored into `sequence`. If the content is already tuple content, this inderection creates crosstalk
* between indices. Therefore, tuple content is never read into `TIterableElement(sequence)`; it is instead
* transferred directly from `TIterableSequence(sequence)` to `sequence` via a flow step. Such a flow step will
* also transfer other content, but only tuple content is further read from `sequence` into its elements.
*
* The strategy is then via several read-, store-, and flow steps:
* 1. [Flow] Content is transferred from `iterable` to `TIterableSequence(sequence)` via a
* flow step. From here, everything happens on the LHS.
*
* 2. [Flow] Content is transferred from `TIterableSequence(sequence)` to `sequence` via a
* flow step. (Here only tuple content is relevant.)
*
* 3. [Read] Content is read from `TIterableSequence(sequence)` into `TIterableElement(sequence)`.
* As `sequence` is modeled as a tuple, we will not read tuple content as that would allow
* crosstalk.
*
* 4. [Store] Content is stored from `TIterableElement(sequence)` to `sequence`.
* Content type is `TupleElementContent` with indices taken from the syntax.
* For instance, if `sequence` is `(a, *b, c)`, content is written to index 0, 1, and 2.
* This is adequate as the route through `TIterableElement(sequence)` does not transfer precise content.
*
* 5. [Read] Content is read from `sequence` to its elements.
* a) If the element is a plain variable, the target is the corresponding essa node.
*
* b) If the element is itself a sequence, with control-flow node `seq`, the target is `TIterableSequence(seq)`.
*
* c) If the element is a starred variable, with control-flow node `v`, the target is `TIterableElement(v)`.
*
* 6. [Store] Content is stored from `TIterableElement(v)` to the essa variable for `v`, with
* content type `ListElementContent`.
*
* 7. [Flow, Read, Store] Steps 2 through 7 are repeated for all recursive elements which are sequences.
*
*
* We illustrate the above steps on the assignment
*
* ```python
* (a, b) = ["a", SOURCE]
* ```
*
* Looking at the content propagation to `a`:
* `["a", SOURCE]`: [ListElementContent]
*
* --Step 1-->
*
* `TIterableSequence((a, b))`: [ListElementContent]
*
* --Step 3-->
*
* `TIterableElement((a, b))`: []
*
* --Step 4-->
*
* `(a, b)`: [TupleElementContent(0)]
*
* --Step 5a-->
*
* `a`: []
*
* Meaning there is data-flow from the RHS to `a` (an over approximation). The same logic would be applied to show there is data-flow to `b`. Note that _Step 3_ and _Step 4_ would not have been needed if the RHS had been a tuple (since that would have been able to use _Step 2_ instead).
*
* Another, more complicated example:
* ```python
* (a, [b, *c]) = ["a", [SOURCE]]
* ```
* where the path to `c` is
*
* `["a", [SOURCE]]`: [ListElementContent; ListElementContent]
*
* --Step 1-->
*
* `TIterableSequence((a, [b, *c]))`: [ListElementContent; ListElementContent]
*
* --Step 3-->
*
* `TIterableElement((a, [b, *c]))`: [ListElementContent]
*
* --Step 4-->
*
* `(a, [b, *c])`: [TupleElementContent(1); ListElementContent]
*
* --Step 5b-->
*
* `TIterableSequence([b, *c])`: [ListElementContent]
*
* --Step 3-->
*
* `TIterableElement([b, *c])`: []
*
* --Step 4-->
*
* `[b, *c]`: [TupleElementContent(1)]
*
* --Step 5c-->
*
* `TIterableElement(c)`: []
*
* --Step 6-->
*
* `c`: [ListElementContent]
*/
module UnpackingAssignment {
/** A direct (or top-level) target of an unpacking assignment. */
class UnpackingAssignmentDirectTarget extends ControlFlowNode {
Expr value;
UnpackingAssignmentDirectTarget() {
this instanceof SequenceNode and
exists(Assign assign | this.getNode() = assign.getATarget() | value = assign.getValue())
}
Expr getValue() { result = value }
}
/** A (possibly recursive) target of an unpacking assignment. */
class UnpackingAssignmentTarget extends ControlFlowNode {
UnpackingAssignmentTarget() {
this instanceof UnpackingAssignmentDirectTarget
or
this = any(UnpackingAssignmentSequenceTarget parent).getAnElement()
}
}
/** A (possibly recursive) target of an unpacking assignment which is also a sequence. */
class UnpackingAssignmentSequenceTarget extends UnpackingAssignmentTarget {
UnpackingAssignmentSequenceTarget() { this instanceof SequenceNode }
ControlFlowNode getElement(int i) { result = this.(SequenceNode).getElement(i) }
ControlFlowNode getAnElement() { result = this.getElement(_) }
}
/**
* Step 2
* Data flows from `TIterableSequence(sequence)` to `sequence`
*/
predicate unpackingAssignmentFlowStep(Node nodeFrom, Node nodeTo) {
exists(UnpackingAssignmentSequenceTarget target |
nodeFrom = TIterableSequenceNode(target) and
nodeTo.asCfgNode() = target
)
}
/**
* Step 3
* Data flows from `TIterableSequence(sequence)` into `TIterableElement(sequence)`.
* As `sequence` is modeled as a tuple, we will not read tuple content as that would allow
* crosstalk.
*/
predicate unpackingAssignmentConvertingReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(UnpackingAssignmentSequenceTarget target |
nodeFrom = TIterableSequenceNode(target) and
nodeTo = TIterableElementNode(target) and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
// TODO: dict content in iterable unpacking not handled
)
)
}
/**
* Step 4
* Data flows from `TIterableElement(sequence)` to `sequence`.
* Content type is `TupleElementContent` with indices taken from the syntax.
* For instance, if `sequence` is `(a, *b, c)`, content is written to index 0, 1, and 2.
*/
predicate unpackingAssignmentConvertingStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(UnpackingAssignmentSequenceTarget target |
nodeFrom = TIterableElementNode(target) and
nodeTo.asCfgNode() = target and
exists(int index | exists(target.getElement(index)) |
c.(TupleElementContent).getIndex() = index
)
)
}
/**
* Step 5
* For a sequence node inside an iterable unpacking, data flows from the sequence to its elements. There are
* three cases for what `toNode` should be:
* a) If the element is a plain variable, `toNode` is the corresponding essa node.
*
* b) If the element is itself a sequence, with control-flow node `seq`, `toNode` is `TIterableSequence(seq)`.
*
* c) If the element is a starred variable, with control-flow node `v`, `toNode` is `TIterableElement(v)`.
*/
predicate unpackingAssignmentElementReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(
UnpackingAssignmentSequenceTarget target, int index, ControlFlowNode element, int starIndex
|
target.getElement(starIndex) instanceof StarredNode
or
not exists(target.getAnElement().(StarredNode)) and
starIndex = -1
|
nodeFrom.asCfgNode() = target and
element = target.getElement(index) and
(
if starIndex = -1 or index < starIndex
then c.(TupleElementContent).getIndex() = index
else
// This could get big if big tuples exist
if index = starIndex
then c.(TupleElementContent).getIndex() >= index
else c.(TupleElementContent).getIndex() >= index - 1
) and
(
if element instanceof SequenceNode
then
// Step 5b
nodeTo = TIterableSequenceNode(element)
else
if element instanceof StarredNode
then
// Step 5c
nodeTo = TIterableElementNode(element)
else
// Step 5a
nodeTo.asVar().getDefinition().(MultiAssignmentDefinition).getDefiningNode() = element
)
)
}
/**
* Step 6
* Data flows from `TIterableElement(v)` to the essa variable for `v`, with
* content type `ListElementContent`.
*/
predicate unpackingAssignmentStarredElementStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(ControlFlowNode starred | starred.getNode() instanceof Starred |
nodeFrom = TIterableElementNode(starred) and
nodeTo.asVar().getDefinition().(MultiAssignmentDefinition).getDefiningNode() = starred and
c instanceof ListElementContent
)
}
/** All read steps associated with unpacking assignment. */
predicate unpackingAssignmentReadStep(Node nodeFrom, Content c, Node nodeTo) {
unpackingAssignmentElementReadStep(nodeFrom, c, nodeTo)
or
unpackingAssignmentConvertingReadStep(nodeFrom, c, nodeTo)
}
/** All store steps associated with unpacking assignment. */
predicate unpackingAssignmentStoreStep(Node nodeFrom, Content c, Node nodeTo) {
unpackingAssignmentStarredElementStoreStep(nodeFrom, c, nodeTo)
or
unpackingAssignmentConvertingStoreStep(nodeFrom, c, nodeTo)
}
}
import UnpackingAssignment
/** Data flows from a sequence to a call to `pop` on the sequence. */
predicate popReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
// set.pop or list.pop
@@ -1139,7 +1477,7 @@ predicate attributeReadStep(CfgNode nodeFrom, AttributeContent c, CfgNode nodeTo
predicate kwUnpackReadStep(CfgNode nodeFrom, DictionaryElementContent c, Node nodeTo) {
exists(CallNode call, CallableValue callable, string name |
nodeFrom.asCfgNode() = call.getNode().getKwargs().getAFlowNode() and
nodeTo = TKwUnpacked(call, callable, name) and
nodeTo = TKwUnpackedNode(call, callable, name) and
name = c.getKey()
)
}

View File

@@ -58,9 +58,18 @@ newtype TNode =
* That is, `call` contains argument `**{"foo": bar}` which is passed
* to parameter `foo` of `callable`.
*/
TKwUnpacked(CallNode call, CallableValue callable, string name) {
TKwUnpackedNode(CallNode call, CallableValue callable, string name) {
call_unpacks(call, _, callable, name, _)
}
} or
/**
* A synthetic node representing that an iterable sequence flows to consumer.
*/
TIterableSequenceNode(UnpackingAssignmentSequenceTarget consumer) or
/**
* A synthetic node representing that there may be an iterable element
* for `consumer` to consume.
*/
TIterableElementNode(UnpackingAssignmentTarget consumer)
/** Helper for `Node::getEnclosingCallable`. */
private DataFlowCallable getCallableScope(Scope s) {
@@ -338,11 +347,11 @@ class KwOverflowNode extends Node, TKwOverflowNode {
* The node representing the synthetic argument of a call that is unpacked from a dictionary
* argument.
*/
class KwUnpacked extends Node, TKwUnpacked {
class KwUnpackedNode extends Node, TKwUnpackedNode {
CallNode call;
string name;
KwUnpacked() { this = TKwUnpacked(call, _, name) }
KwUnpackedNode() { this = TKwUnpackedNode(call, _, name) }
override string toString() { result = "KwUnpacked " + name }
@@ -356,6 +365,42 @@ class KwUnpacked extends Node, TKwUnpacked {
override Location getLocation() { result = call.getLocation() }
}
/**
* A synthetic node representing an iterable sequence. Used for changing content type
* for instance from a `ListElement` to a `TupleElement`, especially if the content is
* transferred via a read step which cannot be broken up into a read and a store. The
* read step then targets TIterableSequence, and the conversion can happen via a read
* step to TIterableElement followed by a store step to the target.
*/
class IterableSequenceNode extends Node, TIterableSequenceNode {
CfgNode consumer;
IterableSequenceNode() { this = TIterableSequenceNode(consumer.getNode()) }
override string toString() { result = "IterableSequence" }
override DataFlowCallable getEnclosingCallable() { result = consumer.getEnclosingCallable() }
override Location getLocation() { result = consumer.getLocation() }
}
/**
* A synthetic node representing an iterable element. Used for changing content type
* for instance from a `ListElement` to a `TupleElement`. This would happen via a
* read step from the list to IterableElement followed by a store step to the tuple.
*/
class IterableElementNode extends Node, TIterableElementNode {
CfgNode consumer;
IterableElementNode() { this = TIterableElementNode(consumer.getNode()) }
override string toString() { result = "IterableElement" }
override DataFlowCallable getEnclosingCallable() { result = consumer.getEnclosingCallable() }
override Location getLocation() { result = consumer.getLocation() }
}
/**
* A node that controls whether other nodes are evaluated.
*/