// codeql/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

private import python
private import DataFlowPublic
import semmle.python.SpecialMethods
private import semmle.python.essa.SsaCompute
private import semmle.python.dataflow.new.internal.ImportStar
/** Gets the callable in which this node occurs. */
DataFlowCallable nodeGetEnclosingCallable(Node n) { result = n.getEnclosingCallable() }
/** A parameter position represented by an integer. */
class ParameterPosition extends int {
ParameterPosition() { exists(any(DataFlowCallable c).getParameter(this)) }
}
/** An argument position represented by an integer. */
class ArgumentPosition extends int {
ArgumentPosition() { exists(any(DataFlowCall c).getArg(this)) }
}
/** Holds if arguments at position `apos` match parameters at position `ppos`. */
pragma[inline]
predicate parameterMatch(ParameterPosition ppos, ArgumentPosition apos) { ppos = apos }
/** Holds if `p` is a `ParameterNode` of `c` with position `pos`. */
predicate isParameterNode(ParameterNode p, DataFlowCallable c, ParameterPosition pos) {
p.isParameterOf(c, pos)
}
/** Holds if `arg` is an `ArgumentNode` of `c` with position `pos`. */
predicate isArgumentNode(ArgumentNode arg, DataFlowCall c, ArgumentPosition pos) {
arg.argumentOf(c, pos)
}
//--------
// Data flow graph
//--------
//--------
// Nodes
//--------
predicate isExpressionNode(ControlFlowNode node) { node.getNode() instanceof Expr }
/** A module collecting the different reasons for synthesising a pre-update node. */
module syntheticPreUpdateNode {
class SyntheticPreUpdateNode extends Node, TSyntheticPreUpdateNode {
NeedsSyntheticPreUpdateNode post;
SyntheticPreUpdateNode() { this = TSyntheticPreUpdateNode(post) }
/** Gets the node for which this is a synthetic pre-update node. */
Node getPostUpdateNode() { result = post }
override string toString() { result = "[pre " + post.label() + "] " + post.toString() }
override Scope getScope() { result = post.getScope() }
override Location getLocation() { result = post.getLocation() }
}
/** A data flow node for which we should synthesise an associated pre-update node. */
class NeedsSyntheticPreUpdateNode extends PostUpdateNode {
NeedsSyntheticPreUpdateNode() { this = objectCreationNode() }
override Node getPreUpdateNode() { result.(SyntheticPreUpdateNode).getPostUpdateNode() = this }
/**
* A label for this kind of node. This will figure in the textual representation of the synthesized pre-update node.
*
* There is currently only one reason for needing a pre-update node, so we always use that as the label.
*/
string label() { result = "objCreate" }
}
/**
* Calls to constructors are treated as post-update nodes for the synthesized argument
* that is mapped to the `self` parameter. That way, constructor calls represent the value of the
* object after the constructor (currently only `__init__`) has run.
*/
CfgNode objectCreationNode() { result.getNode().(CallNode) = any(ClassCall c).getNode() }
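// For illustration (hypothetical code):
// `class C:`
// `    def __init__(self, x): self.x = x`
// `c = C(data)`
// The call `C(data)` is the post-update node for a synthetic pre-update node, so
// `c` refers to the state of the object after `__init__` has run.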
}
import syntheticPreUpdateNode
/** A module collecting the different reasons for synthesising a post-update node. */
module syntheticPostUpdateNode {
/** A post-update node is synthesized for all nodes which satisfy `NeedsSyntheticPostUpdateNode`. */
class SyntheticPostUpdateNode extends PostUpdateNode, TSyntheticPostUpdateNode {
NeedsSyntheticPostUpdateNode pre;
SyntheticPostUpdateNode() { this = TSyntheticPostUpdateNode(pre) }
override Node getPreUpdateNode() { result = pre }
override string toString() { result = "[post " + pre.label() + "] " + pre.toString() }
override Scope getScope() { result = pre.getScope() }
override Location getLocation() { result = pre.getLocation() }
}
/** A data flow node for which we should synthesise an associated post-update node. */
class NeedsSyntheticPostUpdateNode extends Node {
NeedsSyntheticPostUpdateNode() {
this = argumentPreUpdateNode()
or
this = storePreUpdateNode()
or
this = readPreUpdateNode()
}
/**
* A label for this kind of node. This will figure in the textual representation of the synthesized post-update node.
* We favour being an argument as the reason for the post-update node in case multiple reasons apply.
*/
string label() {
if this = argumentPreUpdateNode()
then result = "arg"
else
if this = storePreUpdateNode()
then result = "store"
else result = "read"
}
}
/**
* An argument might have its value changed as a result of a call.
* Certain arguments, such as implicit self arguments, are already post-update nodes
* and should not have an extra node synthesised.
*/
ArgumentNode argumentPreUpdateNode() {
result = any(FunctionCall c).getArg(_)
or
// Avoid argument 0 of method calls as those have read post-update nodes.
exists(MethodCall c, int n | n > 0 | result = c.getArg(n))
or
result = any(SpecialCall c).getArg(_)
or
// Avoid argument 0 of class calls as those have non-synthetic post-update nodes.
exists(ClassCall c, int n | n > 0 | result = c.getArg(n))
}
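// For illustration (hypothetical code):
// `def append_one(items):  # hypothetical helper`
// `    items.append(1)`
// `lst = []`
// `append_one(lst)`
// The argument `lst` gets a synthesized post-update node, since the call may have
// modified the value it refers to.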
/** An object might have its value changed after a store. */
CfgNode storePreUpdateNode() {
exists(Attribute a |
result.getNode() = a.getObject().getAFlowNode() and
a.getCtx() instanceof Store
)
}
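// For illustration (hypothetical code), in
// `obj = SomeClass()  # hypothetical class`
// `obj.foo = tracked`
// the node for `obj` on the second line is the pre-update node; its synthesized
// post-update node represents `obj` after the attribute store.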
/**
* A node marking the state change of an object after a read.
*
* A reverse read happens when the result of a read is modified, e.g. in
* ```python
* l = [ mutable ]
* l[0].mutate()
* ```
* we may now have changed the content of `l`. To track this, there must be
* a post-update node for `l`.
*/
CfgNode readPreUpdateNode() {
exists(Attribute a |
result.getNode() = a.getObject().getAFlowNode() and
a.getCtx() instanceof Load
)
or
result.getNode() = any(SubscriptNode s).getObject()
or
// The dictionary argument is read from if the callable has parameters matching the keys.
result.getNode().getNode() = any(Call call).getKwargs()
}
}
import syntheticPostUpdateNode
class DataFlowExpr = Expr;
/**
* Flow between ESSA variables.
* This includes both local and global variables.
* Flow comes from definitions, uses and refinements.
*/
// TODO: Consider constraining `nodeFrom` and `nodeTo` to be in the same scope.
// If they have different enclosing callables, we get consistency errors.
module EssaFlow {
predicate essaFlowStep(Node nodeFrom, Node nodeTo) {
// Definition
// `x = f(42)`
// nodeFrom is `f(42)`, cfg node
// nodeTo is `x`, essa var
nodeFrom.(CfgNode).getNode() =
nodeTo.(EssaNode).getVar().getDefinition().(AssignmentDefinition).getValue()
or
// With definition
// `with f(42) as x:`
// nodeFrom is `f(42)`, cfg node
// nodeTo is `x`, essa var
exists(With with, ControlFlowNode contextManager, ControlFlowNode var |
nodeFrom.(CfgNode).getNode() = contextManager and
nodeTo.(EssaNode).getVar().getDefinition().(WithDefinition).getDefiningNode() = var and
// see `with_flow` in `python/ql/src/semmle/python/dataflow/Implementation.qll`
with.getContextExpr() = contextManager.getNode() and
with.getOptionalVars() = var.getNode() and
not with.isAsync() and
contextManager.strictlyDominates(var)
)
or
// Async with var definition
// `async with f(42) as x:`
// nodeFrom is `x`, cfg node
// nodeTo is `x`, essa var
//
// This makes the cfg node the local source of the awaited value.
exists(With with, ControlFlowNode var |
nodeFrom.(CfgNode).getNode() = var and
nodeTo.(EssaNode).getVar().getDefinition().(WithDefinition).getDefiningNode() = var and
with.getOptionalVars() = var.getNode() and
with.isAsync()
)
or
// Parameter definition
// `def foo(x):`
// nodeFrom is `x`, cfgNode
// nodeTo is `x`, essa var
exists(ParameterDefinition pd |
nodeFrom.asCfgNode() = pd.getDefiningNode() and
nodeTo.asVar() = pd.getVariable()
)
or
// First use after definition
// `y = 42`
// `x = f(y)`
// nodeFrom is `y` on first line, essa var
// nodeTo is `y` on second line, cfg node
defToFirstUse(nodeFrom.asVar(), nodeTo.asCfgNode())
or
// Next use after use
// `x = f(y)`
// `z = y + 1`
// nodeFrom is 'y' on first line, cfg node
// nodeTo is `y` on second line, cfg node
useToNextUse(nodeFrom.asCfgNode(), nodeTo.asCfgNode())
or
// If expressions
nodeFrom.asCfgNode() = nodeTo.asCfgNode().(IfExprNode).getAnOperand()
or
// boolean inline expressions such as `x or y` or `x and y`
nodeFrom.asCfgNode() = nodeTo.asCfgNode().(BoolExprNode).getAnOperand()
or
// Flow inside an unpacking assignment
iterableUnpackingFlowStep(nodeFrom, nodeTo)
or
matchFlowStep(nodeFrom, nodeTo)
or
// Overflow keyword argument
exists(CallNode call, CallableValue callable |
call = callable.getACall() and
nodeTo = TKwOverflowNode(call, callable) and
nodeFrom.asCfgNode() = call.getNode().getKwargs().getAFlowNode()
)
}
predicate useToNextUse(NameNode nodeFrom, NameNode nodeTo) {
AdjacentUses::adjacentUseUse(nodeFrom, nodeTo)
}
predicate defToFirstUse(EssaVariable var, NameNode nodeTo) {
AdjacentUses::firstUse(var.getDefinition(), nodeTo)
}
}
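// As a combined illustration of the steps above (hypothetical code):
// `def f(x):  # hypothetical function`
// `    y = g(x)`
// `    return y`
// Flow goes from the defining node of the parameter `x` to its essa variable, from
// that definition to the first use of `x` in `g(x)`, from `g(x)` to the essa
// variable `y`, and from that definition to the use of `y` in `return y`.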
//--------
// Local flow
//--------
/**
* This is the local flow predicate that is used as a building block in global
* data flow.
*
* Local flow can happen either at import time, when the module is initialised
* or at runtime when callables in the module are called.
*/
predicate simpleLocalFlowStep(Node nodeFrom, Node nodeTo) {
// If there is local flow out of a node `node`, we want flow
// both out of `node` and any post-update node of `node`.
exists(Node node |
nodeFrom = update(node) and
(
importTimeLocalFlowStep(node, nodeTo) or
runtimeLocalFlowStep(node, nodeTo)
)
)
}
/**
* Holds if `node` is found at the top level of a module.
*/
pragma[inline]
predicate isTopLevel(Node node) { node.getScope() instanceof Module }
/** Holds if there is local flow from `nodeFrom` to `nodeTo` at import time. */
predicate importTimeLocalFlowStep(Node nodeFrom, Node nodeTo) {
// As a proxy for whether statements can be executed at import time,
// we check if they appear at the top level.
// This will miss statements inside functions called from the top level.
isTopLevel(nodeFrom) and
isTopLevel(nodeTo) and
EssaFlow::essaFlowStep(nodeFrom, nodeTo)
}
/** Holds if there is local flow from `nodeFrom` to `nodeTo` at runtime. */
predicate runtimeLocalFlowStep(Node nodeFrom, Node nodeTo) {
// Anything not at the top level can be executed at runtime.
not isTopLevel(nodeFrom) and
not isTopLevel(nodeTo) and
EssaFlow::essaFlowStep(nodeFrom, nodeTo)
}
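// For illustration (hypothetical code), in a module such as
// `CONFIG = read_config()  # hypothetical call`
// `def get():`
// `    value = compute()  # hypothetical call`
// `    return value`
// the step from `read_config()` to `CONFIG` happens at the top level and is an
// import-time step, while the steps inside `get` are runtime steps.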
/** `ModuleVariable`s are accessed via jump steps at runtime. */
predicate runtimeJumpStep(Node nodeFrom, Node nodeTo) {
// Module variable read
nodeFrom.(ModuleVariableNode).getARead() = nodeTo
or
// Module variable write
nodeFrom = nodeTo.(ModuleVariableNode).getAWrite()
or
// Setting the possible values of the variable at the end of import time
exists(SsaVariable def |
def = any(SsaVariable var).getAnUltimateDefinition() and
def.getDefinition() = nodeFrom.asCfgNode() and
def.getVariable() = nodeTo.(ModuleVariableNode).getVariable()
)
}
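// For illustration (hypothetical code):
// `counter = 0`
// `def bump():`
// `    return counter + 1`
// The write `counter = 0` reaches the `ModuleVariableNode` for `counter`, and the
// read of `counter` inside `bump` is connected to it via a jump step, since the two
// occurrences are in different callables.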
/**
* Holds if `result` is either `node`, or the post-update node for `node`.
*/
private Node update(Node node) {
result = node
or
result.(PostUpdateNode).getPreUpdateNode() = node
}
// TODO: Make modules for these headings
//--------
// Global flow
//--------
//
/**
* Computes routing of arguments to parameters
*
* When a call contains more positional arguments than there are positional parameters,
* the extra positional arguments are passed as a tuple to a starred parameter. This is
* achieved by synthesizing a node `TPosOverflowNode(call, callable)`
* that represents the tuple of extra positional arguments. There is a store step from each
* extra positional argument to this node.
*
* CURRENTLY NOT SUPPORTED:
* When a call contains an iterable unpacking argument, such as `func(*args)`, it is expanded into positional arguments.
*
* CURRENTLY NOT SUPPORTED:
* If a call contains an iterable unpacking argument, such as `func(*args)`, and the callee contains a starred argument, any extra
* positional arguments are passed to the starred argument.
*
* When a call contains keyword arguments that do not correspond to keyword parameters, these
* extra keyword arguments are passed as a dictionary to a doubly starred parameter. This is
* achieved by synthesizing a node `TKwOverflowNode(call, callable)`
* that represents the dictionary of extra keyword arguments. There is a store step from each
* extra keyword argument to this node.
*
* When a call contains a dictionary unpacking argument, such as `func(**kwargs)`, with entries corresponding to a keyword parameter,
* the value at such a key is unpacked and passed to the parameter. This is achieved
* by synthesizing an argument node `TKwUnpacked(call, callable, name)` representing the unpacked
* value. This node is used as the argument passed to the matching keyword parameter. There is a read
* step from the dictionary argument to the synthesized argument node.
*
* When a call contains a dictionary unpacking argument, such as `func(**kwargs)`, and the callee contains a doubly starred parameter,
* entries which are not unpacked are passed to the doubly starred parameter. This is achieved by
* adding a dataflow step from the dictionary argument to `TKwOverflowNode(call, callable)` and a
* step to clear content of that node at any unpacked keys.
*
* ## Examples:
* Assume that we have the callable
* ```python
* def f(x, y, *t, **d):
* pass
* ```
* Then the call
* ```python
* f(0, 1, 2, a=3)
* ```
* will be modeled as
* ```python
* f(0, 1, [*t], [**d])
* ```
* where `[` and `]` denotes synthesized nodes, so `[*t]` is the synthesized tuple argument
* `TPosOverflowNode` and `[**d]` is the synthesized dictionary argument `TKwOverflowNode`.
* There will be a store step from `2` to `[*t]` at pos `0` and one from `3` to `[**d]` at key
* `a`.
*
* For the call
* ```python
* f(0, **{"y": 1, "a": 3})
* ```
* no tuple argument is synthesized. It is modeled as
* ```python
* f(0, [y=1], [**d])
* ```
* where `[y=1]` is the synthesized unpacked argument `TKwUnpacked` (with `name` = `y`). There is
* a read step from `**{"y": 1, "a": 3}` to `[y=1]` at key `y` to get the value passed to the parameter
* `y`. There is a dataflow step from `**{"y": 1, "a": 3}` to `[**d]` to transfer the content and
* a clearing of content at key `y` for node `[**d]`, since that value has been unpacked.
*/
module ArgumentPassing {
/**
* Holds if `call` represents a `DataFlowCall` to a `DataFlowCallable` represented by `callable`.
*
* It _may not_ be the case that `call = callable.getACall()`, e.g. if `call` represents a `ClassCall`.
*
* Used to limit the size of predicates.
*/
predicate connects(CallNode call, CallableValue callable) {
exists(DataFlowCall c |
call = c.getNode() and
callable = c.getCallable().getCallableValue()
)
}
/**
* Gets the `n`th parameter of `callable`.
* If the callable has a starred parameter, say `*tuple`, that is matched with `n=-1`.
* If the callable has a doubly starred parameter, say `**dict`, that is matched with `n=-2`.
* Note that, unlike other languages, we do _not_ use -1 for the position of `self` in Python,
* as it is an explicit parameter at position 0.
*/
NameNode getParameter(CallableValue callable, int n) {
// positional parameter
result = callable.getParameter(n)
or
// starred parameter, `*tuple`
exists(Function f |
f = callable.getScope() and
n = -1 and
result = f.getVararg().getAFlowNode()
)
or
// doubly starred parameter, `**dict`
exists(Function f |
f = callable.getScope() and
n = -2 and
result = f.getKwarg().getAFlowNode()
)
}
/**
* A type representing a mapping from argument indices to parameter indices.
* We currently use two mappings: NoShift, the identity, used for ordinary
* function calls, and ShiftOneUp which is used for calls where an extra argument
* is inserted. These include method calls, constructor calls and class calls.
* In these calls, the argument at index `n` is mapped to the parameter at position `n+1`.
*/
newtype TArgParamMapping =
TNoShift() or
TShiftOneUp()
/** A mapping used for parameter passing. */
abstract class ArgParamMapping extends TArgParamMapping {
/** Gets the index of the parameter that corresponds to the argument at index `argN`. */
bindingset[argN]
abstract int getParamN(int argN);
/** Gets a textual representation of this element. */
abstract string toString();
}
/** A mapping that passes argument `n` to parameter `n`. */
class NoShift extends ArgParamMapping, TNoShift {
NoShift() { this = TNoShift() }
override string toString() { result = "NoShift [n -> n]" }
bindingset[argN]
override int getParamN(int argN) { result = argN }
}
/** A mapping that passes argument `n` to parameter `n+1`. */
class ShiftOneUp extends ArgParamMapping, TShiftOneUp {
ShiftOneUp() { this = TShiftOneUp() }
override string toString() { result = "ShiftOneUp [n -> n+1]" }
bindingset[argN]
override int getParamN(int argN) { result = argN + 1 }
}
/**
* Gets the node representing the argument to `call` that is passed to the parameter at
* (zero-based) index `paramN` in `callable`. If this is a positional argument, it must appear
* at an index, `argN`, in `call` which satisfies `paramN = mapping.getParamN(argN)`.
*
* `mapping` will be the identity for function calls, but not for method- or constructor calls,
* where the first parameter is `self` and the first positional argument is passed to the second positional parameter.
* Similarly for classmethod calls, where the first parameter is `cls`.
*
* NOT SUPPORTED: Keyword-only parameters.
*/
Node getArg(CallNode call, ArgParamMapping mapping, CallableValue callable, int paramN) {
connects(call, callable) and
(
// positional argument
exists(int argN |
paramN = mapping.getParamN(argN) and
result = TCfgNode(call.getArg(argN))
)
or
// keyword argument
// TODO: Since `getArgName` has no results for keyword-only parameters,
// these are currently not supported.
exists(Function f, string argName |
f = callable.getScope() and
f.getArgName(paramN) = argName and
result = TCfgNode(call.getArgByName(unbind_string(argName)))
)
or
// a synthesized argument passed to the starred parameter (at position -1)
callable.getScope().hasVarArg() and
paramN = -1 and
result = TPosOverflowNode(call, callable)
or
// a synthesized argument passed to the doubly starred parameter (at position -2)
callable.getScope().hasKwArg() and
paramN = -2 and
result = TKwOverflowNode(call, callable)
or
// argument unpacked from dict
exists(string name |
call_unpacks(call, mapping, callable, name, paramN) and
result = TKwUnpackedNode(call, callable, name)
)
)
}
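// For illustration (hypothetical code), for a method call such as
// `class C:`
// `    def m(self, x): ...  # hypothetical method`
// `c = C()`
// `c.m(42)`
// the mapping is `ShiftOneUp`, so the argument `42` at index 0 is routed to the
// parameter `x` at index 1; the receiver `c` is passed to `self` separately by the
// call class (see `MethodCall.getArg`).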
/** Currently required in `getArg` in order to prevent a bad join. */
bindingset[result, s]
private string unbind_string(string s) { result <= s and s <= result }
/** Gets the control flow node that is passed as the `n`th overflow positional argument. */
ControlFlowNode getPositionalOverflowArg(CallNode call, CallableValue callable, int n) {
connects(call, callable) and
exists(Function f, int posCount, int argNr |
f = callable.getScope() and
f.hasVarArg() and
posCount = f.getPositionalParameterCount() and
result = call.getArg(argNr) and
argNr >= posCount and
argNr = posCount + n
)
}
/** Gets the control flow node that is passed as the overflow keyword argument with key `key`. */
ControlFlowNode getKeywordOverflowArg(CallNode call, CallableValue callable, string key) {
connects(call, callable) and
exists(Function f |
f = callable.getScope() and
f.hasKwArg() and
not exists(f.getArgByName(key)) and
result = call.getArgByName(key)
)
}
/**
* Holds if `call` unpacks a dictionary argument in order to pass it via `name`.
* It will then be passed to the parameter of `callable` at index `paramN`.
*/
predicate call_unpacks(
CallNode call, ArgParamMapping mapping, CallableValue callable, string name, int paramN
) {
connects(call, callable) and
exists(Function f |
f = callable.getScope() and
not exists(int argN | paramN = mapping.getParamN(argN) | exists(call.getArg(argN))) and // no positional argument available
name = f.getArgName(paramN) and
// not exists(call.getArgByName(name)) and // only matches keyword arguments not preceded by **
// TODO: make the below logic respect control flow splitting (by not going to the AST).
not call.getNode().getANamedArg().(Keyword).getArg() = name and // no keyword argument available
paramN >= 0 and
paramN < f.getPositionalParameterCount() + f.getKeywordOnlyParameterCount() and
exists(call.getNode().getKwargs()) // dict argument available
)
}
}
import ArgumentPassing
/**
* IPA type for DataFlowCallable.
*
* A callable is either a function value, a class value, or a module (for enclosing `ModuleVariableNode`s).
* A module has no calls.
*/
newtype TDataFlowCallable =
TCallableValue(CallableValue callable) {
callable instanceof FunctionValue and
not callable.(FunctionValue).isLambda()
or
callable instanceof ClassValue
} or
TLambda(Function lambda) { lambda.isLambda() } or
TModule(Module m)
/** Represents a callable. */
abstract class DataFlowCallable extends TDataFlowCallable {
/** Gets a textual representation of this element. */
abstract string toString();
/** Gets a call to this callable. */
abstract CallNode getACall();
/** Gets the scope of this callable. */
abstract Scope getScope();
/** Gets the specified parameter of this callable. */
abstract NameNode getParameter(int n);
/** Gets the name of this callable. */
abstract string getName();
/** Gets a callable value for this callable, if one exists. */
abstract CallableValue getCallableValue();
}
/** A class representing a callable value. */
class DataFlowCallableValue extends DataFlowCallable, TCallableValue {
CallableValue callable;
DataFlowCallableValue() { this = TCallableValue(callable) }
override string toString() { result = callable.toString() }
override CallNode getACall() { result = callable.getACall() }
override Scope getScope() { result = callable.getScope() }
override NameNode getParameter(int n) { result = getParameter(callable, n) }
override string getName() { result = callable.getName() }
override CallableValue getCallableValue() { result = callable }
}
/** A class representing a callable lambda. */
class DataFlowLambda extends DataFlowCallable, TLambda {
Function lambda;
DataFlowLambda() { this = TLambda(lambda) }
override string toString() { result = lambda.toString() }
override CallNode getACall() { result = this.getCallableValue().getACall() }
override Scope getScope() { result = lambda.getEvaluatingScope() }
override NameNode getParameter(int n) { result = getParameter(this.getCallableValue(), n) }
override string getName() { result = "Lambda callable" }
override FunctionValue getCallableValue() {
result.getOrigin().getNode() = lambda.getDefinition()
}
}
/** A class representing the scope in which a `ModuleVariableNode` appears. */
class DataFlowModuleScope extends DataFlowCallable, TModule {
Module mod;
DataFlowModuleScope() { this = TModule(mod) }
override string toString() { result = mod.toString() }
override CallNode getACall() { none() }
override Scope getScope() { result = mod }
override NameNode getParameter(int n) { none() }
override string getName() { result = mod.getName() }
override CallableValue getCallableValue() { none() }
}
/**
* IPA type for DataFlowCall.
*
* Calls corresponding to `CallNode`s are either to callable values or to classes.
* The latter is directed to the callable corresponding to the `__init__` method of the class.
*
* An `__init__` method can also be called directly, so that the callable can be targeted by
* different types of calls. In that case, the parameter mappings will be different,
* as the class call will synthesize an argument node to be mapped to the `self` parameter.
*
* A call corresponding to a special method call is handled by the corresponding `SpecialMethodCallNode`.
*
* TODO: Add `TClassMethodCall` mapping `cls` appropriately.
*/
newtype TDataFlowCall =
TFunctionCall(CallNode call) { call = any(FunctionValue f).getAFunctionCall() } or
/** Bound methods need to make room for the explicit self parameter. */
TMethodCall(CallNode call) { call = any(FunctionValue f).getAMethodCall() } or
TClassCall(CallNode call) { call = any(ClassValue c).getACall() } or
TSpecialCall(SpecialMethodCallNode special)
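// For illustration (hypothetical code), with
// `class C:`
// `    def __init__(self, x): ...`
// the class call `C(42)` is a `TClassCall` targeting `__init__` and synthesizes the
// argument bound to `self`, whereas a direct call such as `C.__init__(obj, 42)`
// passes `obj` to `self` explicitly and therefore uses a different parameter mapping.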
/** Represents a call. */
abstract class DataFlowCall extends TDataFlowCall {
/** Gets a textual representation of this element. */
abstract string toString();
/** Gets the callable to which this call goes. */
abstract DataFlowCallable getCallable();
/**
* Gets the argument to this call that will be sent
* to the `n`th parameter of the callable.
*/
abstract Node getArg(int n);
/** Gets the control flow node representing this call. */
abstract ControlFlowNode getNode();
/** Gets the enclosing callable of this call. */
abstract DataFlowCallable getEnclosingCallable();
/** Gets the location of this dataflow call. */
Location getLocation() { result = this.getNode().getLocation() }
}
/**
* Represents a call to a function/lambda.
* This excludes calls to bound methods, classes, and special methods.
* Bound method calls and class calls insert an argument for the explicit
* `self` parameter, and special method calls have special argument passing.
*/
class FunctionCall extends DataFlowCall, TFunctionCall {
CallNode call;
DataFlowCallable callable;
FunctionCall() {
this = TFunctionCall(call) and
call = callable.getACall()
}
override string toString() { result = call.toString() }
override Node getArg(int n) { result = getArg(call, TNoShift(), callable.getCallableValue(), n) }
override ControlFlowNode getNode() { result = call }
override DataFlowCallable getCallable() { result = callable }
override DataFlowCallable getEnclosingCallable() { result.getScope() = call.getNode().getScope() }
}
/**
* Represents a call to a bound method.
* The node representing the instance is inserted as argument to the `self` parameter.
*/
class MethodCall extends DataFlowCall, TMethodCall {
CallNode call;
FunctionValue bm;
MethodCall() {
this = TMethodCall(call) and
call = bm.getACall()
}
private CallableValue getCallableValue() { result = bm }
override string toString() { result = call.toString() }
override Node getArg(int n) {
n > 0 and result = getArg(call, TShiftOneUp(), this.getCallableValue(), n)
or
n = 0 and result = TCfgNode(call.getFunction().(AttrNode).getObject())
}
override ControlFlowNode getNode() { result = call }
override DataFlowCallable getCallable() { result = TCallableValue(this.getCallableValue()) }
override DataFlowCallable getEnclosingCallable() { result.getScope() = call.getScope() }
}
/**
* Represents a call to a class.
* The pre-update node for the call is inserted as argument to the `self` parameter.
* That makes the call node be the post-update node holding the value of the object
* after the constructor has run.
*/
class ClassCall extends DataFlowCall, TClassCall {
CallNode call;
ClassValue c;
ClassCall() {
this = TClassCall(call) and
call = c.getACall()
}
private CallableValue getCallableValue() { c.getScope().getInitMethod() = result.getScope() }
override string toString() { result = call.toString() }
override Node getArg(int n) {
n > 0 and result = getArg(call, TShiftOneUp(), this.getCallableValue(), n)
or
n = 0 and result = TSyntheticPreUpdateNode(TCfgNode(call))
}
override ControlFlowNode getNode() { result = call }
override DataFlowCallable getCallable() { result = TCallableValue(this.getCallableValue()) }
override DataFlowCallable getEnclosingCallable() { result.getScope() = call.getScope() }
}
/** Represents a call to a special method. */
class SpecialCall extends DataFlowCall, TSpecialCall {
SpecialMethodCallNode special;
SpecialCall() { this = TSpecialCall(special) }
override string toString() { result = special.toString() }
override Node getArg(int n) { result = TCfgNode(special.(SpecialMethod::Potential).getArg(n)) }
override ControlFlowNode getNode() { result = special }
override DataFlowCallable getCallable() {
result = TCallableValue(special.getResolvedSpecialMethod())
}
override DataFlowCallable getEnclosingCallable() {
result.getScope() = special.getNode().getScope()
}
}
/** Gets a viable run-time target for the call `call`. */
DataFlowCallable viableCallable(DataFlowCall call) { result = call.getCallable() }
private newtype TReturnKind = TNormalReturnKind()
/**
* A return kind. A return kind describes how a value can be returned
* from a callable. For Python, this is simply a method return.
*/
class ReturnKind extends TReturnKind {
/** Gets a textual representation of this element. */
string toString() { result = "return" }
}
/** A data flow node that represents a value returned by a callable. */
class ReturnNode extends CfgNode {
Return ret;
// See `TaintTrackingImplementation::returnFlowStep`
ReturnNode() { node = ret.getValue().getAFlowNode() }
/** Gets the kind of this return node. */
ReturnKind getKind() { any() }
}
/** A data flow node that represents the output of a call. */
class OutNode extends CfgNode {
OutNode() { node instanceof CallNode }
}
/**
* Gets a node that can read the value returned from `call` with return kind
* `kind`.
*/
OutNode getAnOutNode(DataFlowCall call, ReturnKind kind) {
call.getNode() = result.getNode() and
kind = TNormalReturnKind()
}
//--------
// Type pruning
//--------
newtype TDataFlowType = TAnyFlow()
class DataFlowType extends TDataFlowType {
/** Gets a textual representation of this element. */
string toString() { result = "DataFlowType" }
}
/** A node that performs a type cast. */
class CastNode extends Node {
// We include read- and store steps here to force them to be
// shown in path explanations.
// This hack is necessary, because we have included some of these
// steps as default taint steps, making them be suppressed in path
// explanations.
// We should revert this once we can remove these steps from the
// default taint steps; this should be possible once we have
// implemented flow summaries and recursive content.
CastNode() { readStep(_, _, this) or storeStep(_, _, this) }
}
/**
* Holds if `t1` and `t2` are compatible, that is, whether data can flow from
* a node of type `t1` to a node of type `t2`.
*/
pragma[inline]
predicate compatibleTypes(DataFlowType t1, DataFlowType t2) { any() }
/**
* Gets the type of `node`.
*/
DataFlowType getNodeType(Node node) {
result = TAnyFlow() and
// Suppress unused variable warning
node = node
}
/** Gets a string representation of a type returned by `getErasedRepr`. */
string ppReprType(DataFlowType t) { none() }
//--------
// Extra flow
//--------
/**
* Holds if `pred` can flow to `succ`, by jumping from one callable to
* another. Additional steps specified by the configuration are *not*
* taken into account.
*/
predicate jumpStep(Node nodeFrom, Node nodeTo) {
runtimeJumpStep(nodeFrom, nodeTo)
or
// Read of module attribute:
exists(AttrRead r, ModuleValue mv |
r.getObject().asCfgNode().pointsTo(mv) and
module_export(mv.getScope(), r.getAttributeName(), nodeFrom) and
nodeTo = r
)
or
// Default value for parameter flows to that parameter
defaultValueFlowStep(nodeFrom, nodeTo)
}
/**
* Holds if the module `m` defines a name `name` by assigning `defn` to it. This is an
* overapproximation, as `name` may not in fact be exported (e.g. by defining an `__all__` that does
* not include `name`).
*/
private predicate module_export(Module m, string name, CfgNode defn) {
exists(EssaVariable v |
v.getName() = name and
v.getAUse() = ImportStar::getStarImported*(m).getANormalExit()
|
defn.getNode() = v.getDefinition().(AssignmentDefinition).getValue()
or
defn.getNode() = v.getDefinition().(ArgumentRefinement).getArgument()
)
}
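// For illustration (hypothetical code), with a module `config.py` containing
// `SECRET = get_secret()  # hypothetical call`
// and another module doing
// `import config`
// `value = config.SECRET`
// there is a jump step from the assigned value in `config.py` to the attribute read
// `config.SECRET`. This is an overapproximation if `config` defines an `__all__`
// that omits `SECRET`.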
//--------
// Field flow
//--------
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
* content `c`.
*/
predicate storeStep(Node nodeFrom, Content c, Node nodeTo) {
listStoreStep(nodeFrom, c, nodeTo)
or
setStoreStep(nodeFrom, c, nodeTo)
or
tupleStoreStep(nodeFrom, c, nodeTo)
or
dictStoreStep(nodeFrom, c, nodeTo)
or
comprehensionStoreStep(nodeFrom, c, nodeTo)
or
iterableUnpackingStoreStep(nodeFrom, c, nodeTo)
or
attributeStoreStep(nodeFrom, c, nodeTo)
or
posOverflowStoreStep(nodeFrom, c, nodeTo)
or
kwOverflowStoreStep(nodeFrom, c, nodeTo)
or
matchStoreStep(nodeFrom, c, nodeTo)
}
/** Data flows from an element of a list to the list. */
predicate listStoreStep(CfgNode nodeFrom, ListElementContent c, CfgNode nodeTo) {
// List
// `[..., 42, ...]`
// nodeFrom is `42`, cfg node
// nodeTo is the list, `[..., 42, ...]`, cfg node
// c denotes element of list
nodeTo.getNode().(ListNode).getAnElement() = nodeFrom.getNode() and
not nodeTo.getNode() instanceof UnpackingAssignmentSequenceTarget and
// Suppress unused variable warning
c = c
}
/** Data flows from an element of a set to the set. */
predicate setStoreStep(CfgNode nodeFrom, SetElementContent c, CfgNode nodeTo) {
// Set
// `{..., 42, ...}`
// nodeFrom is `42`, cfg node
// nodeTo is the set, `{..., 42, ...}`, cfg node
// c denotes element of set
nodeTo.getNode().(SetNode).getAnElement() = nodeFrom.getNode() and
// Suppress unused variable warning
c = c
}
/** Data flows from an element of a tuple to the tuple at a specific index. */
predicate tupleStoreStep(CfgNode nodeFrom, TupleElementContent c, CfgNode nodeTo) {
// Tuple
// `(..., 42, ...)`
// nodeFrom is `42`, cfg node
// nodeTo is the tuple, `(..., 42, ...)`, cfg node
// c denotes element of tuple and index of nodeFrom
exists(int n |
nodeTo.getNode().(TupleNode).getElement(n) = nodeFrom.getNode() and
not nodeTo.getNode() instanceof UnpackingAssignmentSequenceTarget and
c.getIndex() = n
)
}
/** Data flows from an element of a dictionary to the dictionary at a specific key. */
predicate dictStoreStep(CfgNode nodeFrom, DictionaryElementContent c, CfgNode nodeTo) {
// Dictionary
// `{..., "key" = 42, ...}`
// nodeFrom is `42`, cfg node
// nodeTo is the dict, `{..., "key" = 42, ...}`, cfg node
// c denotes element of dictionary and the key `"key"`
exists(KeyValuePair item |
item = nodeTo.getNode().(DictNode).getNode().(Dict).getAnItem() and
nodeFrom.getNode().getNode() = item.getValue() and
c.getKey() = item.getKey().(StrConst).getS()
)
}
/** Data flows from an element expression in a comprehension to the comprehension. */
predicate comprehensionStoreStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
// Comprehension
// `[x+1 for x in l]`
// nodeFrom is `x+1`, cfg node
// nodeTo is `[x+1 for x in l]`, cfg node
// c denotes list or set or dictionary without index
//
// List
nodeTo.getNode().getNode().(ListComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof ListElementContent
or
// Set
nodeTo.getNode().getNode().(SetComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof SetElementContent
or
// Dictionary
nodeTo.getNode().getNode().(DictComp).getElt() = nodeFrom.getNode().getNode() and
c instanceof DictionaryElementAnyContent
or
// Generator
nodeTo.getNode().getNode().(GeneratorExp).getElt() = nodeFrom.getNode().getNode() and
c instanceof ListElementContent
}
/**
* Holds if `nodeFrom` flows into an attribute (corresponding to `c`) of `nodeTo` via an attribute assignment.
*
* For example, in
* ```python
* obj.foo = x
* ```
* data flows from `x` to (the post-update node for) `obj` via assignment to `foo`.
*/
predicate attributeStoreStep(CfgNode nodeFrom, AttributeContent c, PostUpdateNode nodeTo) {
exists(AttrNode attr |
nodeFrom.asCfgNode() = attr.(DefinitionNode).getValue() and
attr.getName() = c.getAttribute() and
attr.getObject() = nodeTo.getPreUpdateNode().(CfgNode).getNode()
)
}
/**
* Holds if `nodeFrom` flows into the synthesized positional overflow argument (`nodeTo`)
* at the position indicated by `c`.
*/
predicate posOverflowStoreStep(CfgNode nodeFrom, TupleElementContent c, Node nodeTo) {
exists(CallNode call, CallableValue callable, int n |
nodeFrom.asCfgNode() = getPositionalOverflowArg(call, callable, n) and
nodeTo = TPosOverflowNode(call, callable) and
c.getIndex() = n
)
}
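// For illustration (hypothetical code), given
// `def f(x, *rest): ...  # hypothetical function`
// `f(0, 1, 2)`
// the arguments `1` and `2` are stored into the synthesized `TPosOverflowNode` at
// tuple indices 0 and 1, respectively.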
/**
* Holds if `nodeFrom` flows into the synthesized keyword overflow argument (`nodeTo`)
* at the key indicated by `c`.
*/
predicate kwOverflowStoreStep(CfgNode nodeFrom, DictionaryElementContent c, Node nodeTo) {
exists(CallNode call, CallableValue callable, string key |
nodeFrom.asCfgNode() = getKeywordOverflowArg(call, callable, key) and
nodeTo = TKwOverflowNode(call, callable) and
c.getKey() = key
)
}
predicate defaultValueFlowStep(CfgNode nodeFrom, CfgNode nodeTo) {
exists(Function f, Parameter p, ParameterDefinition def |
// `getArgByName` supports, unlike `getAnArg`, keyword-only parameters
p = f.getArgByName(_) and
nodeFrom.asExpr() = p.getDefault() and
// The following expresses
// nodeTo.(ParameterNode).getParameter() = p
// without non-monotonic recursion
def.getParameter() = p and
nodeTo.getNode() = def.getDefiningNode()
)
}
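// For illustration (hypothetical code), in
// `def f(x, y=DEFAULT): ...  # hypothetical function and default`
// the default expression `DEFAULT` flows to the parameter node for `y`, so calls
// that omit `y` still see flow from the default value.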
/**
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of content `c`.
*/
predicate readStep(Node nodeFrom, Content c, Node nodeTo) {
subscriptReadStep(nodeFrom, c, nodeTo)
or
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
or
matchReadStep(nodeFrom, c, nodeTo)
or
popReadStep(nodeFrom, c, nodeTo)
or
forReadStep(nodeFrom, c, nodeTo)
or
attributeReadStep(nodeFrom, c, nodeTo)
or
kwUnpackReadStep(nodeFrom, c, nodeTo)
}
/** Data flows from a sequence to a subscript of the sequence. */
predicate subscriptReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
// Subscript
// `l[3]`
// nodeFrom is `l`, cfg node
// nodeTo is `l[3]`, cfg node
// c is compatible with 3
nodeFrom.getNode() = nodeTo.getNode().(SubscriptNode).getObject() and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
or
c instanceof DictionaryElementAnyContent
or
c.(TupleElementContent).getIndex() =
nodeTo.getNode().(SubscriptNode).getIndex().getNode().(IntegerLiteral).getValue()
or
c.(DictionaryElementContent).getKey() =
nodeTo.getNode().(SubscriptNode).getIndex().getNode().(StrConst).getS()
)
}
/**
* The unpacking assignment takes the general form
* ```python
* sequence = iterable
* ```
* where `sequence` is either a tuple or a list and it can contain wildcards.
* The iterable can be any iterable, which means that (CodeQL modeling of) content
* will need to change type if it should be transferred from the RHS to the LHS.
*
* Note that (CodeQL modeling of) content does not have to change type on data-flow
* paths _inside_ the LHS, as the different allowed syntaxes here are merely a convenience.
* Consequently, we model all LHS sequences as tuples, which have the more precise content
* model, making flow to the elements more precise. If an element is a starred variable,
* we will have to mutate the content type to be list content.
*
* We may for instance have
* ```python
* (a, b) = ["a", SOURCE] # RHS has content `ListElementContent`
* ```
* Due to the abstraction for list content, we do not know whether `SOURCE`
* ends up in `a` or in `b`, so we want to overapproximate and see it in both.
*
* Using wildcards we may have
* ```python
* (a, *b) = ("a", "b", SOURCE) # RHS has content `TupleElementContent(2)`
* ```
* Since the starred variables are always assigned (Python-)type list, `*b` will be
* `["b", SOURCE]`, and we will again overapproximate and assign it
* content corresponding to anything found in the RHS.
*
* For a precise transfer
* ```python
* (a, b) = ("a", SOURCE) # RHS has content `TupleElementContent(1)`
* ```
* we wish to keep the precision, so only `b` receives the tuple content at index 1.
*
* Finally, `sequence` is actually a pattern and can have a more complicated structure,
* such as
* ```python
* (a, [b, *c]) = ("a", ["b", SOURCE]) # RHS has content `TupleElementContent(1); ListElementContent`
* ```
* where `a` should not receive content, but `b` and `c` should. `c` will be `[SOURCE]` so
* should have the content transferred, while `b` should read it.
*
* To transfer content from RHS to the elements of the LHS in the expression `sequence = iterable`,
* we use two synthetic nodes:
*
* - `TIterableSequence(sequence)` which captures the content-modeling the entire `sequence` will have
* (essentially just a copy of the content-modeling the RHS has)
*
* - `TIterableElement(sequence)` which captures the content-modeling that will be assigned to an element.
* Note that an empty access path means that the value we are tracking flows directly to the element.
*
*
* The `TIterableSequence(sequence)` is at this point superfluous but becomes useful when handling recursive
* structures in the LHS, where `sequence` is some internal sequence node. We can have a uniform treatment
* by always having these two synthetic nodes. So we transfer to (or, in the recursive case, read into)
* `TIterableSequence(sequence)`, from which we take a read step to `TIterableElement(sequence)` and then a
* store step to `sequence`.
*
* This allows the unknown content from the RHS to be read into `TIterableElement(sequence)` and tuple content
* to then be stored into `sequence`. If the content is already tuple content, this indirection creates crosstalk
* between indices. Therefore, tuple content is never read into `TIterableElement(sequence)`; it is instead
* transferred directly from `TIterableSequence(sequence)` to `sequence` via a flow step. Such a flow step will
* also transfer other content, but only tuple content is further read from `sequence` into its elements.
*
* The strategy is then via several read-, store-, and flow steps:
* 1. a) [Flow] Content is transferred from `iterable` to `TIterableSequence(sequence)` via a
* flow step. From here, everything happens on the LHS.
*
* b) [Read] If the unpacking happens inside a for as in
* ```python
* for sequence in iterable
* ```
* then content is read from `iterable` to `TIterableSequence(sequence)`.
*
* 2. [Flow] Content is transferred from `TIterableSequence(sequence)` to `sequence` via a
* flow step. (Here only tuple content is relevant.)
*
* 3. [Read] Content is read from `TIterableSequence(sequence)` into `TIterableElement(sequence)`.
* As `sequence` is modeled as a tuple, we will not read tuple content as that would allow
* crosstalk.
*
* 4. [Store] Content is stored from `TIterableElement(sequence)` to `sequence`.
* Content type is `TupleElementContent` with indices taken from the syntax.
* For instance, if `sequence` is `(a, *b, c)`, content is written to index 0, 1, and 2.
* This is adequate as the route through `TIterableElement(sequence)` does not transfer precise content.
*
* 5. [Read] Content is read from `sequence` to its elements.
* a) If the element is a plain variable, the target is the corresponding essa node.
*
* b) If the element is itself a sequence, with control-flow node `seq`, the target is `TIterableSequence(seq)`.
*
* c) If the element is a starred variable, with control-flow node `v`, the target is `TIterableElement(v)`.
*
* 6. [Store] Content is stored from `TIterableElement(v)` to the essa variable for `v`, with
* content type `ListElementContent`.
*
* 7. [Flow, Read, Store] Steps 2 through 6 are repeated for all recursive elements that are sequences.
*
*
* We illustrate the above steps on the assignment
*
* ```python
* (a, b) = ["a", SOURCE]
* ```
*
* Looking at the content propagation to `a`:
* `["a", SOURCE]`: [ListElementContent]
*
* --Step 1a-->
*
* `TIterableSequence((a, b))`: [ListElementContent]
*
* --Step 3-->
*
* `TIterableElement((a, b))`: []
*
* --Step 4-->
*
* `(a, b)`: [TupleElementContent(0)]
*
* --Step 5a-->
*
* `a`: []
*
* This means there is data flow from the RHS to `a` (an overapproximation). The same logic shows
* there is data flow to `b`. Note that _Step 3_ and _Step 4_ would not have been needed if the RHS
* had been a tuple, since then _Step 2_ could have been used instead.
*
* Another, more complicated example:
* ```python
* (a, [b, *c]) = ["a", [SOURCE]]
* ```
* where the path to `c` is
*
* `["a", [SOURCE]]`: [ListElementContent; ListElementContent]
*
* --Step 1a-->
*
* `TIterableSequence((a, [b, *c]))`: [ListElementContent; ListElementContent]
*
* --Step 3-->
*
* `TIterableElement((a, [b, *c]))`: [ListElementContent]
*
* --Step 4-->
*
* `(a, [b, *c])`: [TupleElementContent(1); ListElementContent]
*
* --Step 5b-->
*
* `TIterableSequence([b, *c])`: [ListElementContent]
*
* --Step 3-->
*
* `TIterableElement([b, *c])`: []
*
* --Step 4-->
*
* `[b, *c]`: [TupleElementContent(1)]
*
* --Step 5c-->
*
* `TIterableElement(c)`: []
*
* --Step 6-->
*
* `c`: [ListElementContent]
*/
module IterableUnpacking {
/**
* The target of a `for`, e.g. `x` in `for x in list` or in `[42 for x in list]`.
* This class also records the source, which in both above cases is `list`.
* This class abstracts away the differing representations of comprehensions and
* for statements.
*/
class ForTarget extends ControlFlowNode {
Expr source;
ForTarget() {
exists(For for |
source = for.getIter() and
this.getNode() = for.getTarget() and
not for = any(Comp comp).getNthInnerLoop(0)
)
or
exists(Comp comp |
source = comp.getIterable() and
this.getNode() = comp.getNthInnerLoop(0).getTarget()
)
}
Expr getSource() { result = source }
}
/** The LHS of an assignment. This class also records the assigned value. */
class AssignmentTarget extends ControlFlowNode {
Expr value;
AssignmentTarget() {
exists(Assign assign | this.getNode() = assign.getATarget() | value = assign.getValue())
}
Expr getValue() { result = value }
}
/** A direct (or top-level) target of an unpacking assignment. */
class UnpackingAssignmentDirectTarget extends ControlFlowNode {
Expr value;
UnpackingAssignmentDirectTarget() {
this instanceof SequenceNode and
(
value = this.(AssignmentTarget).getValue()
or
value = this.(ForTarget).getSource()
)
}
Expr getValue() { result = value }
}
/** A (possibly recursive) target of an unpacking assignment. */
class UnpackingAssignmentTarget extends ControlFlowNode {
UnpackingAssignmentTarget() {
this instanceof UnpackingAssignmentDirectTarget
or
this = any(UnpackingAssignmentSequenceTarget parent).getAnElement()
}
}
/** A (possibly recursive) target of an unpacking assignment which is also a sequence. */
class UnpackingAssignmentSequenceTarget extends UnpackingAssignmentTarget instanceof SequenceNode {
ControlFlowNode getElement(int i) { result = super.getElement(i) }
ControlFlowNode getAnElement() { result = this.getElement(_) }
}
/**
* Step 1a
* Data flows from `iterable` to `TIterableSequence(sequence)`
*/
predicate iterableUnpackingAssignmentFlowStep(Node nodeFrom, Node nodeTo) {
exists(AssignmentTarget target |
nodeFrom.asExpr() = target.getValue() and
nodeTo = TIterableSequenceNode(target)
)
}
/**
* Step 1b
* Data is read from `iterable` to `TIterableSequence(sequence)`
*/
predicate iterableUnpackingForReadStep(CfgNode nodeFrom, Content c, Node nodeTo) {
exists(ForTarget target |
nodeFrom.asExpr() = target.getSource() and
target instanceof SequenceNode and
nodeTo = TIterableSequenceNode(target)
) and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
)
}
/**
* Step 2
* Data flows from `TIterableSequence(sequence)` to `sequence`
*/
predicate iterableUnpackingTupleFlowStep(Node nodeFrom, Node nodeTo) {
exists(UnpackingAssignmentSequenceTarget target |
nodeFrom = TIterableSequenceNode(target) and
nodeTo.asCfgNode() = target
)
}
/**
* Step 3
* Data flows from `TIterableSequence(sequence)` into `TIterableElement(sequence)`.
* As `sequence` is modeled as a tuple, we will not read tuple content as that would allow
* crosstalk.
*/
predicate iterableUnpackingConvertingReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(UnpackingAssignmentSequenceTarget target |
nodeFrom = TIterableSequenceNode(target) and
nodeTo = TIterableElementNode(target) and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
// TODO: dict content in iterable unpacking not handled
)
)
}
/**
* Step 4
* Data flows from `TIterableElement(sequence)` to `sequence`.
* Content type is `TupleElementContent` with indices taken from the syntax.
* For instance, if `sequence` is `(a, *b, c)`, content is written to index 0, 1, and 2.
*/
predicate iterableUnpackingConvertingStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(UnpackingAssignmentSequenceTarget target |
nodeFrom = TIterableElementNode(target) and
nodeTo.asCfgNode() = target and
exists(int index | exists(target.getElement(index)) |
c.(TupleElementContent).getIndex() = index
)
)
}
/**
* Step 5
* For a sequence node inside an iterable unpacking, data flows from the sequence to its elements. There are
* three cases for what `toNode` should be:
* a) If the element is a plain variable, `toNode` is the corresponding essa node.
*
* b) If the element is itself a sequence, with control-flow node `seq`, `toNode` is `TIterableSequence(seq)`.
*
* c) If the element is a starred variable, with control-flow node `v`, `toNode` is `TIterableElement(v)`.
*/
predicate iterableUnpackingElementReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(
UnpackingAssignmentSequenceTarget target, int index, ControlFlowNode element, int starIndex
|
target.getElement(starIndex) instanceof StarredNode
or
not exists(target.getAnElement().(StarredNode)) and
starIndex = -1
|
nodeFrom.asCfgNode() = target and
element = target.getElement(index) and
(
if starIndex = -1 or index < starIndex
then c.(TupleElementContent).getIndex() = index
else
// This could get big if big tuples exist
if index = starIndex
then c.(TupleElementContent).getIndex() >= index
else c.(TupleElementContent).getIndex() >= index - 1
) and
(
if element instanceof SequenceNode
then
// Step 5b
nodeTo = TIterableSequenceNode(element)
else
if element instanceof StarredNode
then
// Step 5c
nodeTo = TIterableElementNode(element)
else
// Step 5a
nodeTo.asVar().getDefinition().(MultiAssignmentDefinition).getDefiningNode() = element
)
)
}
/**
* Step 6
* Data flows from `TIterableElement(v)` to the essa variable for `v`, with
* content type `ListElementContent`.
*/
predicate iterableUnpackingStarredElementStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(ControlFlowNode starred | starred.getNode() instanceof Starred |
nodeFrom = TIterableElementNode(starred) and
nodeTo.asVar().getDefinition().(MultiAssignmentDefinition).getDefiningNode() = starred and
c instanceof ListElementContent
)
}
/** All read steps associated with unpacking assignment. */
predicate iterableUnpackingReadStep(Node nodeFrom, Content c, Node nodeTo) {
iterableUnpackingForReadStep(nodeFrom, c, nodeTo)
or
iterableUnpackingElementReadStep(nodeFrom, c, nodeTo)
or
iterableUnpackingConvertingReadStep(nodeFrom, c, nodeTo)
}
/** All store steps associated with unpacking assignment. */
predicate iterableUnpackingStoreStep(Node nodeFrom, Content c, Node nodeTo) {
iterableUnpackingStarredElementStoreStep(nodeFrom, c, nodeTo)
or
iterableUnpackingConvertingStoreStep(nodeFrom, c, nodeTo)
}
/** All flow steps associated with unpacking assignment. */
predicate iterableUnpackingFlowStep(Node nodeFrom, Node nodeTo) {
iterableUnpackingAssignmentFlowStep(nodeFrom, nodeTo)
or
iterableUnpackingTupleFlowStep(nodeFrom, nodeTo)
}
}
import IterableUnpacking
/**
* There are a number of patterns available for the match statement.
* Each one transfers data and content differently to its parts.
*
* Furthermore, given a successful match, we can infer some data about
* the subject. Consider the example:
* ```python
* match choice:
* case 'Y':
* ...body
* ```
* Inside `body`, we know that `choice` has the value `'Y'`.
*
* A similar thing happens with the "as pattern". Consider the example:
* ```python
* match choice:
* case ('y'|'Y') as c:
* ...body
* ```
* By the binding rules, there is data flow from `choice` to `c`. But we
* can infer the value of `c` to be either `'y'` or `'Y'` if the match succeeds.
*
* We will treat such inferences separately as guards. First we will model the data flow
* stemming from the bindings and the matching of shape. Below, 'subject' is not necessarily the
* top-level subject of the match, but rather the part recursively matched by the current pattern.
* For instance, in the example:
* ```python
* match command:
* case ('quit' as c) | ('go', ('up'|'down') as c):
* ...body
* ```
* `command` is the subject of the first as-pattern, while the second component of `command`
* is the subject of the second as-pattern. As such, 'subject' refers to the value matched by
* the pattern under evaluation.
*
* - as pattern: subject flows to alias as well as to the interior pattern
* - or pattern: subject flows to each alternative
* - literal pattern: flow from the literal to the pattern, to add information
* - capture pattern: subject flows to the variable
* - wildcard pattern: no flow
* - value pattern: flow from the value to the pattern, to add information
* - sequence pattern: each element reads from subject at the associated index
* - star pattern: subject flows to the variable, possibly via a conversion
* - mapping pattern: each value reads from subject at the associated key
* - double star pattern: subject flows to the variable, possibly via a conversion
* - key-value pattern: the value reads from the subject at the key (see mapping pattern)
* - class pattern: all keywords read the appropriate attribute from the subject
* - keyword pattern: the appropriate attribute is read from the subject (see class pattern)
*
* Inside the class pattern, we also find positional arguments. They are converted to
* keyword arguments using the `__match_args__` attribute on the class. We do not
* currently model this.
*/
module MatchUnpacking {
/**
* The subject of a match flows to each top-level pattern
* (a pattern directly under a `case` statement).
*
* We could consider a model closer to use-use-flow, where the subject
* only flows to the first top-level pattern and from there to the
* following ones.
*/
predicate matchSubjectFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchStmt match, Expr subject, Pattern target |
subject = match.getSubject() and
target = match.getCase(_).(Case).getPattern()
|
nodeFrom.asExpr() = subject and
nodeTo.asCfgNode().getNode() = target
)
}
/**
* as pattern: subject flows to alias as well as to the interior pattern
* syntax (toplevel): `case pattern as alias:`
*/
predicate matchAsFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchAsPattern subject, Name alias | alias = subject.getAlias() |
// We make the subject flow to the alias via the interior pattern.
// That way, information can propagate from the interior pattern to the alias.
//
// the subject flows to the interior pattern
nodeFrom.asCfgNode().getNode() = subject and
nodeTo.asCfgNode().getNode() = subject.getPattern()
or
// the interior pattern flows to the alias
nodeFrom.asCfgNode().getNode() = subject.getPattern() and
nodeTo.asVar().getDefinition().(PatternAliasDefinition).getDefiningNode().getNode() = alias
)
}
/**
* or pattern: subject flows to each alternative
* syntax (toplevel): `case alt1 | alt2:`
*/
predicate matchOrFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchOrPattern subject, Pattern pattern | pattern = subject.getAPattern() |
nodeFrom.asCfgNode().getNode() = subject and
nodeTo.asCfgNode().getNode() = pattern
)
}
/**
* literal pattern: flow from the literal to the pattern, to add information
* syntax (toplevel): `case literal:`
*/
predicate matchLiteralFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchLiteralPattern pattern, Expr literal | literal = pattern.getLiteral() |
nodeFrom.asExpr() = literal and
nodeTo.asCfgNode().getNode() = pattern
)
}
/**
* capture pattern: subject flows to the variable
* syntax (toplevel): `case var:`
*/
predicate matchCaptureFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchCapturePattern capture, Name var | capture.getVariable() = var |
nodeFrom.asCfgNode().getNode() = capture and
nodeTo.asVar().getDefinition().(PatternCaptureDefinition).getDefiningNode().getNode() = var
)
}
/**
* value pattern: flow from the value to the pattern, to add information
* syntax (toplevel): `case Dotted.value:`
*/
predicate matchValueFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchValuePattern pattern, Expr value | value = pattern.getValue() |
nodeFrom.asExpr() = value and
nodeTo.asCfgNode().getNode() = pattern
)
}
/**
* sequence pattern: each element reads from subject at the associated index
* syntax (toplevel): `case [a, b]:`
*/
predicate matchSequenceReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(MatchSequencePattern subject, int index, Pattern element |
element = subject.getPattern(index)
|
nodeFrom.asCfgNode().getNode() = subject and
nodeTo.asCfgNode().getNode() = element and
(
// tuple content
c.(TupleElementContent).getIndex() = index
or
// list content
c instanceof ListElementContent
// set content is excluded from sequence patterns,
// see https://www.python.org/dev/peps/pep-0635/#sequence-patterns
)
)
}
/**
* star pattern: subject flows to the variable, possibly via a conversion
* syntax (toplevel): `case *var:`
*
* We decompose this flow into a read step and a store step. The read step
* reads both tuple and list content, the store step only stores list content.
* This way, we convert all content to list content.
*
* This is the read step.
*/
predicate matchStarReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(MatchSequencePattern subject, int index, MatchStarPattern star |
star = subject.getPattern(index)
|
nodeFrom.asCfgNode().getNode() = subject and
nodeTo = TStarPatternElementNode(star) and
(
// tuple content
c.(TupleElementContent).getIndex() >= index
or
// list content
c instanceof ListElementContent
// set content is excluded from sequence patterns,
// see https://www.python.org/dev/peps/pep-0635/#sequence-patterns
)
)
}
/**
* star pattern: subject flows to the variable, possibly via a conversion
   * syntax (toplevel): `case [first, *var]:`
*
* We decompose this flow into a read step and a store step. The read step
* reads both tuple and list content, the store step only stores list content.
* This way, we convert all content to list content.
*
* This is the store step.
*/
predicate matchStarStoreStep(Node nodeFrom, Content c, Node nodeTo) {
exists(MatchStarPattern star |
nodeFrom = TStarPatternElementNode(star) and
nodeTo.asCfgNode().getNode() = star.getTarget() and
c instanceof ListElementContent
)
}
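
  /*
   * For illustration, a hypothetical snippet of the code the read and store steps above
   * model together (names made up):
   * ```python
   * values = (1, 2, 3, 4)
   * match values:
   *     case [first, *rest]:    # `rest` receives the remaining elements, always as a list,
   *         print(first, rest)  # even though `values` is a tuple here
   * ```
   */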
/**
* mapping pattern: each value reads from subject at the associated key
* syntax (toplevel): `case {"color": c, "height": x}:`
*/
predicate matchMappingReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(
MatchMappingPattern subject, MatchKeyValuePattern keyValue, MatchLiteralPattern key,
Pattern value
|
keyValue = subject.getAMapping() and
key = keyValue.getKey() and
value = keyValue.getValue()
|
nodeFrom.asCfgNode().getNode() = subject and
nodeTo.asCfgNode().getNode() = value and
c.(DictionaryElementContent).getKey() = key.getLiteral().(StrConst).getText()
)
}
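
  /*
   * For illustration, a hypothetical snippet of the code this step models (names made up):
   * ```python
   * sprite = {"color": "red", "height": 32}
   * match sprite:
   *     case {"color": c, "height": h}:  # `c` and `h` read the values stored at those keys
   *         print(c, h)
   * ```
   */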
/**
   * double star pattern: the subject mapping flows to the variable
   * syntax (toplevel): `case {**var}:`
   *
   * Dictionary content flows to the double star target, but all keys matched
   * explicitly elsewhere in the mapping pattern are cleared
   * (see `matchMappingClearStep`).
*/
predicate matchMappingFlowStep(Node nodeFrom, Node nodeTo) {
exists(MatchMappingPattern subject, MatchDoubleStarPattern dstar |
dstar = subject.getAMapping()
|
nodeFrom.asCfgNode().getNode() = subject and
nodeTo.asCfgNode().getNode() = dstar.getTarget()
)
}
/**
   * Keys that are matched explicitly in a mapping pattern are not available
   * to a double star pattern in the same mapping pattern.
*/
predicate matchMappingClearStep(Node n, Content c) {
exists(
MatchMappingPattern subject, MatchKeyValuePattern keyValue, MatchLiteralPattern key,
MatchDoubleStarPattern dstar
|
keyValue = subject.getAMapping() and
key = keyValue.getKey() and
dstar = subject.getAMapping()
|
n.asCfgNode().getNode() = dstar.getTarget() and
c.(DictionaryElementContent).getKey() = key.getLiteral().(StrConst).getText()
)
}
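
  /*
   * For illustration, a hypothetical snippet of the code the flow step and the clear step
   * above model together (names made up):
   * ```python
   * options = {"color": "red", "height": 32}
   * match options:
   *     case {"color": c, **rest}:  # `rest` receives the mapping, but the key "color"
   *         print(rest)             # is cleared, since it was matched explicitly
   * ```
   */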
/**
* class pattern: all keywords read the appropriate attribute from the subject
* syntax (toplevel): `case ClassName(attr = val):`
*/
predicate matchClassReadStep(Node nodeFrom, Content c, Node nodeTo) {
exists(MatchClassPattern subject, MatchKeywordPattern keyword, Name attr, Pattern value |
keyword = subject.getKeyword(_) and
attr = keyword.getAttribute() and
value = keyword.getValue()
|
nodeFrom.asCfgNode().getNode() = subject and
nodeTo.asCfgNode().getNode() = value and
c.(AttributeContent).getAttribute() = attr.getId()
)
}
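
  /*
   * For illustration, a hypothetical snippet of the code this step models (names made up):
   * ```python
   * class Point:
   *     def __init__(self, x, y):
   *         self.x = x
   *         self.y = y
   *
   * p = Point(1, 2)
   * match p:
   *     case Point(x=a, y=b):  # `a` and `b` read the attributes `x` and `y` of the subject
   *         print(a, b)
   * ```
   */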
/** All flow steps associated with match. */
predicate matchFlowStep(Node nodeFrom, Node nodeTo) {
matchSubjectFlowStep(nodeFrom, nodeTo)
or
matchAsFlowStep(nodeFrom, nodeTo)
or
matchOrFlowStep(nodeFrom, nodeTo)
or
matchLiteralFlowStep(nodeFrom, nodeTo)
or
matchCaptureFlowStep(nodeFrom, nodeTo)
or
matchValueFlowStep(nodeFrom, nodeTo)
or
matchMappingFlowStep(nodeFrom, nodeTo)
}
/** All read steps associated with match. */
predicate matchReadStep(Node nodeFrom, Content c, Node nodeTo) {
matchClassReadStep(nodeFrom, c, nodeTo)
or
matchSequenceReadStep(nodeFrom, c, nodeTo)
or
matchMappingReadStep(nodeFrom, c, nodeTo)
or
matchStarReadStep(nodeFrom, c, nodeTo)
}
/** All store steps associated with match. */
predicate matchStoreStep(Node nodeFrom, Content c, Node nodeTo) {
matchStarStoreStep(nodeFrom, c, nodeTo)
}
  /** All clear steps associated with match. */
predicate matchClearStep(Node n, Content c) { matchMappingClearStep(n, c) }
}
import MatchUnpacking
/** Data flows from a collection (a list, set, or dictionary) to a call to `pop` on it. */
predicate popReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
// set.pop or list.pop
// `s.pop()`
// nodeFrom is `s`, cfg node
// nodeTo is `s.pop()`, cfg node
// c denotes element of list or set
exists(CallNode call, AttrNode a |
call.getFunction() = a and
a.getName() = "pop" and // Should match appropriate call since we tracked a sequence here.
not exists(call.getAnArg()) and
nodeFrom.getNode() = a.getObject() and
nodeTo.getNode() = call and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
)
)
or
// dict.pop
// `d.pop("key")`
// nodeFrom is `d`, cfg node
// nodeTo is `d.pop("key")`, cfg node
// c denotes the key `"key"`
exists(CallNode call, AttrNode a |
call.getFunction() = a and
a.getName() = "pop" and // Should match appropriate call since we tracked a dictionary here.
nodeFrom.getNode() = a.getObject() and
nodeTo.getNode() = call and
c.(DictionaryElementContent).getKey() = call.getArg(0).getNode().(StrConst).getS()
)
}
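
/*
 * For illustration, a hypothetical snippet of the code this step models (names made up):
 * ```python
 * s = [1, 2, 3]
 * x = s.pop()       # element content of `s` flows to the result of `pop()`
 *
 * d = {"key": "value"}
 * v = d.pop("key")  # the value stored at "key" in `d` flows to the result of `pop("key")`
 * ```
 */
/**
 * Data flows from an iterable to the `for` target that iterates over it, by reading
 * list, set, or (small) tuple element content.
 */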
predicate forReadStep(CfgNode nodeFrom, Content c, Node nodeTo) {
exists(ForTarget target |
nodeFrom.asExpr() = target.getSource() and
nodeTo.asVar().(EssaNodeDefinition).getDefiningNode() = target
) and
(
c instanceof ListElementContent
or
c instanceof SetElementContent
or
c = small_tuple()
)
}
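
/*
 * For illustration, a hypothetical snippet of the code this step models (names made up):
 * ```python
 * for x in [1, 2, 3]:  # element content of the list flows to the loop variable `x`
 *     print(x)
 * ```
 */
/** Gets a tuple element content whose index is at most 7. */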
pragma[noinline]
TupleElementContent small_tuple() { result.getIndex() <= 7 }
/**
* Holds if `nodeTo` is a read of an attribute (corresponding to `c`) of the object in `nodeFrom`.
*
* For example, in
* ```python
* obj.foo
* ```
* data flows from `obj` to `obj.foo` via a read from `foo`.
*/
predicate attributeReadStep(CfgNode nodeFrom, AttributeContent c, CfgNode nodeTo) {
exists(AttrNode attr |
nodeFrom.asCfgNode() = attr.getObject() and
nodeTo.asCfgNode() = attr and
attr.getName() = c.getAttribute() and
attr.isLoad()
)
}
/**
* Holds if `nodeFrom` is a dictionary argument being unpacked and `nodeTo` is the
 * synthesized unpacked argument with the name indicated by `c`.
*/
predicate kwUnpackReadStep(CfgNode nodeFrom, DictionaryElementContent c, Node nodeTo) {
exists(CallNode call, CallableValue callable, string name |
nodeFrom.asCfgNode() = call.getNode().getKwargs().getAFlowNode() and
nodeTo = TKwUnpackedNode(call, callable, name) and
name = c.getKey()
)
}
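
/*
 * For illustration, a hypothetical snippet of the code this step models (names made up):
 * ```python
 * def greet(name="world"):
 *     print("hello", name)
 *
 * args = {"name": "alice"}
 * greet(**args)  # the value at key "name" in `args` flows to the unpacked argument for `name`
 * ```
 */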
/**
* Clear content at key `name` of the synthesized dictionary `TKwOverflowNode(call, callable)`,
* whenever `call` unpacks `name`.
*/
predicate kwOverflowClearStep(Node n, Content c) {
exists(CallNode call, CallableValue callable, string name |
call_unpacks(call, _, callable, name, _) and
n = TKwOverflowNode(call, callable) and
c.(DictionaryElementContent).getKey() = name
)
}
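
/*
 * For illustration, a hypothetical snippet of the code this clear step models (names made up):
 * ```python
 * def process(color, **rest):
 *     print(color, rest)
 *
 * opts = {"color": "red", "height": 32}
 * process(**opts)  # "color" is bound to the `color` parameter, so it is cleared from
 *                  # the synthesized overflow dictionary that reaches `rest`
 * ```
 */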
/**
* Holds if values stored inside content `c` are cleared at node `n`. For example,
* any value stored inside `f` is cleared at the pre-update node associated with `x`
* in `x.f = newValue`.
*/
predicate clearsContent(Node n, Content c) {
kwOverflowClearStep(n, c)
or
matchClearStep(n, c)
or
attributeClearStep(n, c)
}
/**
* Holds if values stored inside attribute `c` are cleared at node `n`.
*
 * In `obj.foo = x` any old value stored in `foo` is cleared at the pre-update node
 * associated with `obj`.
*/
predicate attributeClearStep(Node n, AttributeContent c) {
exists(PostUpdateNode post | post.getPreUpdateNode() = n | attributeStoreStep(_, c, post))
}
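
/*
 * For illustration, a hypothetical snippet of the code this clear step models (names made up):
 * ```python
 * class Box:
 *     pass
 *
 * box = Box()
 * box.attr = "old"
 * box.attr = "new"  # the old value stored in `attr` is cleared at this assignment,
 * print(box.attr)   # so only "new" reaches this read
 * ```
 */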
//--------
// Fancy context-sensitive guards
//--------
/**
* Holds if the node `n` is unreachable when the call context is `call`.
*/
predicate isUnreachableInCall(Node n, DataFlowCall call) { none() }
//--------
// Virtual dispatch with call context
//--------
/**
* Gets a viable dispatch target of `call` in the context `ctx`. This is
* restricted to those `call`s for which a context might make a difference.
*/
DataFlowCallable viableImplInCallContext(DataFlowCall call, DataFlowCall ctx) { none() }
/**
* Holds if the set of viable implementations that can be called by `call`
* might be improved by knowing the call context. This is the case if the qualifier accesses a parameter of
 * the enclosing callable `c` (including the implicit `self` parameter).
*/
predicate mayBenefitFromCallContext(DataFlowCall call, DataFlowCallable c) { none() }
int accessPathLimit() { result = 5 }
/**
* Holds if access paths with `c` at their head always should be tracked at high
* precision. This disables adaptive access path precision for such access paths.
*/
predicate forceHighPrecision(Content c) { none() }
/** Holds if `n` should be hidden from path explanations. */
predicate nodeIsHidden(Node n) { none() }
class LambdaCallKind = Unit;
/** Holds if `creation` is an expression that creates a lambda of kind `kind` for `c`. */
predicate lambdaCreation(Node creation, LambdaCallKind kind, DataFlowCallable c) { none() }
/** Holds if `call` is a lambda call of kind `kind` where `receiver` is the lambda expression. */
predicate lambdaCall(DataFlowCall call, LambdaCallKind kind, Node receiver) { none() }
/** Extra data-flow steps needed for lambda flow analysis. */
predicate additionalLambdaFlowStep(Node nodeFrom, Node nodeTo, boolean preservesValue) { none() }
/**
* Holds if flow is allowed to pass from parameter `p` and back to itself as a
* side-effect, resulting in a summary from `p` to itself.
*
* One example would be to allow flow like `p.foo = p.bar;`, which is disallowed
* by default as a heuristic.
*/
predicate allowParameterReturnInSelf(ParameterNode p) { none() }