diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll index 1b9e2ded82f..ffca8390d86 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll @@ -199,6 +199,8 @@ abstract class DataFlowFunction extends DataFlowCallable, TFunction { exists(string name | ppos.isKeyword(name) | result.getParameter() = func.getArgByName(name)) or ppos.isDictSplat() and result.getParameter() = func.getKwarg() + or + ppos.isDictSplat() and result = TSynthDictSplatParameterNode(this) } } @@ -1194,7 +1196,9 @@ abstract class ParameterNodeImpl extends Node { * Holds if this node is the parameter of callable `c` at the * position `ppos`. */ - abstract predicate isParameterOf(DataFlowCallable c, ParameterPosition ppos); + predicate isParameterOf(DataFlowCallable c, ParameterPosition ppos) { + this = c.getParameter(ppos) + } } /** A parameter for a library callable with a flow summary. */ diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll index d6abd40e721..7b2a1d81580 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll @@ -39,6 +39,9 @@ predicate isArgumentNode(ArgumentNode arg, DataFlowCall c, ArgumentPosition pos) //-------- predicate isExpressionNode(ControlFlowNode node) { node.getNode() instanceof Expr } +// ============================================================================= +// SyntheticPreUpdateNode +// ============================================================================= class SyntheticPreUpdateNode extends Node, TSyntheticPreUpdateNode { CallNode node; @@ -54,6 +57,9 @@ class SyntheticPreUpdateNode extends Node, TSyntheticPreUpdateNode { override Location getLocation() { result = node.getLocation() } } +// ============================================================================= +// **kwargs (DictSplat) related +// ============================================================================= /** * A (synthetic) data-flow node that represents all keyword arguments, as if they had * been passed in a `**kwargs` argument. @@ -98,11 +104,70 @@ private predicate dictSplatParameterNodeClearStep(ParameterNode n, DictionaryEle exists(DataFlowCallable callable, ParameterPosition dictSplatPos, ParameterPosition keywordPos | dictSplatPos.isDictSplat() and n = callable.getParameter(dictSplatPos) and + not n instanceof SynthDictSplatParameterNode and exists(callable.getParameter(keywordPos)) and keywordPos.isKeyword(c.getKey()) ) } +/** + * A synthetic data-flow node to allow flow to keyword parameters from a `**kwargs` argument. + * + * Take the code snippet below as an example. Since the call only has a `**kwargs` argument, + * with a `**` argument position, we add this synthetic parameter node with `**` parameter position, + * and a read step to the `p1` parameter. + * + * ```py + * def foo(p1): ... + * + * kwargs = {"p1": 42} + * foo(**kwargs) + * ``` + * + * + * Note that this will introduce a bit of redundancy in cases like + * + * ```py + * foo(p1=taint(1), p2=taint(2)) + * ``` + * + * where direct keyword matching is possible, since we construct a synthesized dict + * splat argument (`SynthDictSplatArgumentNode`) at the call site, which means that + * `taint(1)` will flow into `p1` both via normal keyword matching and via the synthesized + * nodes (and similarly for `p2`). However, this redundancy is OK since + * (a) it means that type-tracking through keyword arguments also works in most cases, + * (b) read/store steps can be avoided when direct keyword matching is possible, and + * hence access path limits are not a concern, and + * (c) since the synthesized nodes are hidden, the reported data-flow paths will be + * collapsed anyway. + */ +class SynthDictSplatParameterNode extends ParameterNodeImpl, TSynthDictSplatParameterNode { + DataFlowCallable callable; + + SynthDictSplatParameterNode() { this = TSynthDictSplatParameterNode(callable) } + + override string toString() { result = "SynthDictSplatParameterNode" } + + override Scope getScope() { result = callable.getScope() } + + override Location getLocation() { result = callable.getLocation() } + + override Parameter getParameter() { none() } +} + +predicate synthDictSplatParameterNodeReadStep( + SynthDictSplatParameterNode nodeFrom, DictionaryElementContent c, ParameterNode nodeTo +) { + exists(DataFlowCallable callable, ParameterPosition ppos | + nodeFrom = TSynthDictSplatParameterNode(callable) and + nodeTo = callable.getParameter(ppos) and + ppos.isKeyword(c.getKey()) + ) +} + +// ============================================================================= +// PostUpdateNode +// ============================================================================= abstract class PostUpdateNodeImpl extends Node { /** Gets the node before the state update. */ abstract Node getPreUpdateNode(); @@ -624,6 +689,8 @@ predicate readStep(Node nodeFrom, Content c, Node nodeTo) { attributeReadStep(nodeFrom, c, nodeTo) or FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom, c, nodeTo) + or + synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo) } /** Data flows from a sequence to a subscript of the sequence. */ @@ -783,6 +850,8 @@ predicate nodeIsHidden(Node n) { n instanceof SummaryParameterNode or n instanceof SynthDictSplatArgumentNode + or + n instanceof SynthDictSplatParameterNode } class LambdaCallKind = Unit; diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll index 761202ecb94..94d7bb70543 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll @@ -64,7 +64,7 @@ newtype TNode = exists(Class cls, Function func, ParameterDefinition def | func = cls.getAMethod() and not hasStaticmethodDecorator(func) and - // this matches what we do in ParameterNode + // this matches what we do in ExtractedParameterNode def.getDefiningNode() = node and def.getParameter() = func.getArg(0) ) @@ -113,9 +113,12 @@ newtype TNode = TSummaryParameterNode(FlowSummaryImpl::Public::SummarizedCallable c, ParameterPosition pos) { FlowSummaryImpl::Private::summaryParameterNodeRange(c, pos) } or - TSynthDictSplatArgumentNode(CallNode call) { exists(call.getArgByName(_)) } - -class TParameterNode = TCfgNode or TSummaryParameterNode; + /** A synthetic node to capture keyword arguments that are passed to a `**kwargs` parameter. */ + TSynthDictSplatArgumentNode(CallNode call) { exists(call.getArgByName(_)) } or + /** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */ + TSynthDictSplatParameterNode(DataFlowCallable callable) { + exists(ParameterPosition ppos | ppos.isKeyword(_) | exists(callable.getParameter(ppos))) + } /** Helper for `Node::getEnclosingCallable`. */ private DataFlowCallable getCallableScope(Scope s) { @@ -292,7 +295,7 @@ ExprNode exprNode(DataFlowExpr e) { result.getNode().getNode() = e } * The value of a parameter at function entry, viewed as a node in a data * flow graph. */ -class ParameterNode extends Node, TParameterNode instanceof ParameterNodeImpl { +class ParameterNode extends Node instanceof ParameterNodeImpl { /** Gets the parameter corresponding to this node, if any. */ final Parameter getParameter() { result = super.getParameter() } } @@ -304,10 +307,6 @@ class ExtractedParameterNode extends ParameterNodeImpl, CfgNode { ExtractedParameterNode() { node = def.getDefiningNode() } - override predicate isParameterOf(DataFlowCallable c, ParameterPosition ppos) { - this = c.getParameter(ppos) - } - override Parameter getParameter() { result = def.getParameter() } } diff --git a/python/ql/test/experimental/dataflow/TestUtil/DataFlowConsistency.qll b/python/ql/test/experimental/dataflow/TestUtil/DataFlowConsistency.qll index b11c3ecd838..442c5fff770 100644 --- a/python/ql/test/experimental/dataflow/TestUtil/DataFlowConsistency.qll +++ b/python/ql/test/experimental/dataflow/TestUtil/DataFlowConsistency.qll @@ -8,4 +8,12 @@ private class MyConsistencyConfiguration extends ConsistencyConfiguration { override predicate argHasPostUpdateExclude(ArgumentNode n) { exists(ArgumentPosition apos | n.argumentOf(_, apos) and apos.isDictSplat()) } + + override predicate reverseReadExclude(Node n) { + // since `self`/`cls` parameters can be marked as implicit argument to `super()`, + // they will have PostUpdateNodes. We have a read-step from the synthetic `**kwargs` + // parameter, but dataflow-consistency queries should _not_ complain about there not + // being a post-update node for the synthetic `**kwargs` parameter. + n instanceof SynthDictSplatParameterNode + } } diff --git a/python/ql/test/experimental/dataflow/basic/callGraphSinks.expected b/python/ql/test/experimental/dataflow/basic/callGraphSinks.expected index 17f3028ae23..e4b8f905530 100644 --- a/python/ql/test/experimental/dataflow/basic/callGraphSinks.expected +++ b/python/ql/test/experimental/dataflow/basic/callGraphSinks.expected @@ -1,2 +1,3 @@ +| test.py:1:1:1:21 | SynthDictSplatParameterNode | | test.py:1:19:1:19 | ControlFlowNode for x | | test.py:7:5:7:20 | ControlFlowNode for obfuscated_id() | diff --git a/python/ql/test/experimental/dataflow/basic/local.expected b/python/ql/test/experimental/dataflow/basic/local.expected index 133f740596c..cdf40018ed0 100644 --- a/python/ql/test/experimental/dataflow/basic/local.expected +++ b/python/ql/test/experimental/dataflow/basic/local.expected @@ -5,6 +5,7 @@ | test.py:1:1:1:21 | ControlFlowNode for FunctionExpr | test.py:1:1:1:21 | ControlFlowNode for FunctionExpr | | test.py:1:1:1:21 | ControlFlowNode for FunctionExpr | test.py:1:5:1:17 | GSSA Variable obfuscated_id | | test.py:1:1:1:21 | ControlFlowNode for FunctionExpr | test.py:7:5:7:17 | ControlFlowNode for obfuscated_id | +| test.py:1:1:1:21 | SynthDictSplatParameterNode | test.py:1:1:1:21 | SynthDictSplatParameterNode | | test.py:1:5:1:17 | ControlFlowNode for obfuscated_id | test.py:1:5:1:17 | ControlFlowNode for obfuscated_id | | test.py:1:5:1:17 | GSSA Variable obfuscated_id | test.py:1:5:1:17 | GSSA Variable obfuscated_id | | test.py:1:5:1:17 | GSSA Variable obfuscated_id | test.py:7:5:7:17 | ControlFlowNode for obfuscated_id | diff --git a/python/ql/test/experimental/dataflow/basic/sinks.expected b/python/ql/test/experimental/dataflow/basic/sinks.expected index cfd8effd77b..944f8190aa5 100644 --- a/python/ql/test/experimental/dataflow/basic/sinks.expected +++ b/python/ql/test/experimental/dataflow/basic/sinks.expected @@ -3,6 +3,7 @@ | test.py:0:0:0:0 | GSSA Variable b | | test.py:0:0:0:0 | SSA variable $ | | test.py:1:1:1:21 | ControlFlowNode for FunctionExpr | +| test.py:1:1:1:21 | SynthDictSplatParameterNode | | test.py:1:5:1:17 | ControlFlowNode for obfuscated_id | | test.py:1:5:1:17 | GSSA Variable obfuscated_id | | test.py:1:19:1:19 | ControlFlowNode for x | diff --git a/python/ql/test/experimental/dataflow/basic/sources.expected b/python/ql/test/experimental/dataflow/basic/sources.expected index cfd8effd77b..944f8190aa5 100644 --- a/python/ql/test/experimental/dataflow/basic/sources.expected +++ b/python/ql/test/experimental/dataflow/basic/sources.expected @@ -3,6 +3,7 @@ | test.py:0:0:0:0 | GSSA Variable b | | test.py:0:0:0:0 | SSA variable $ | | test.py:1:1:1:21 | ControlFlowNode for FunctionExpr | +| test.py:1:1:1:21 | SynthDictSplatParameterNode | | test.py:1:5:1:17 | ControlFlowNode for obfuscated_id | | test.py:1:5:1:17 | GSSA Variable obfuscated_id | | test.py:1:19:1:19 | ControlFlowNode for x | diff --git a/python/ql/test/experimental/dataflow/coverage/argumentPassing.py b/python/ql/test/experimental/dataflow/coverage/argumentPassing.py index 4d87e750572..a3e4752ffd2 100644 --- a/python/ql/test/experimental/dataflow/coverage/argumentPassing.py +++ b/python/ql/test/experimental/dataflow/coverage/argumentPassing.py @@ -72,7 +72,7 @@ def argument_passing( @expects(7) def test_argument_passing1(): - argument_passing(arg1, *(arg2, arg3, arg4), e=arg5, **{"f": arg6, "g": arg7}) #$ arg1 arg5 arg7 func=argument_passing MISSING: arg2 arg3 arg4 arg6 + argument_passing(arg1, *(arg2, arg3, arg4), e=arg5, **{"f": arg6, "g": arg7}) #$ arg1 arg5 arg6 arg7 func=argument_passing MISSING: arg2 arg3 arg4 @expects(7) @@ -102,8 +102,8 @@ def with_multiple_kw_args(a, b, c): def test_multiple_kw_args(): with_multiple_kw_args(b=arg2, c=arg3, a=arg1) #$ arg1 arg2 arg3 with_multiple_kw_args(arg1, *(arg2,), arg3) #$ arg1 MISSING: arg2 arg3 - with_multiple_kw_args(arg1, **{"c": arg3}, b=arg2) #$ arg1 arg2 MISSING: arg3 - with_multiple_kw_args(**{"b": arg2}, **{"c": arg3}, **{"a": arg1}) #$ MISSING: arg1 arg2 arg3 + with_multiple_kw_args(arg1, **{"c": arg3}, b=arg2) #$ arg1 arg2 arg3 func=with_multiple_kw_args + with_multiple_kw_args(**{"b": arg2}, **{"c": arg3}, **{"a": arg1}) #$ arg1 arg2 arg3 func=with_multiple_kw_args def with_default_arguments(a=arg1, b=arg2, c=arg3): #$ arg1 arg2 arg3 func=with_default_arguments @@ -117,7 +117,7 @@ def test_default_arguments(): with_default_arguments() with_default_arguments(arg1) #$ arg1 with_default_arguments(b=arg2) #$ arg2 - with_default_arguments(**{"c": arg3}) #$ MISSING: arg3 + with_default_arguments(**{"c": arg3}) #$ arg3 func=with_default_arguments # All combinations @@ -198,5 +198,5 @@ def test_mixed(): args = {"b": arg2, "c": "safe"} # $ arg2 func=mixed mixed(a=arg1, **args) # $ arg1 - args = {"a": arg1, "b": arg2, "c": "safe"} # $ arg2 func=mixed MISSING: arg1 + args = {"a": arg1, "b": arg2, "c": "safe"} # $ arg1 arg2 func=mixed mixed(**args) diff --git a/python/ql/test/experimental/dataflow/coverage/test.py b/python/ql/test/experimental/dataflow/coverage/test.py index f4aadf433b2..1a7b2cbb6fa 100644 --- a/python/ql/test/experimental/dataflow/coverage/test.py +++ b/python/ql/test/experimental/dataflow/coverage/test.py @@ -393,7 +393,7 @@ def test_call_unpack_iterable(): def test_call_unpack_mapping(): - SINK(second(NONSOURCE, **{"b": SOURCE})) #$ MISSING: flow="SOURCE -> second(..)" + SINK(second(NONSOURCE, **{"b": SOURCE})) #$ flow="SOURCE -> second(..)" def f_extra_pos(a, *b): @@ -509,7 +509,7 @@ def test_lambda_unpack_mapping(): def second(a, b): return b - SINK(second(NONSOURCE, **{"b": SOURCE})) #$ MISSING: flow="SOURCE -> second(..)" + SINK(second(NONSOURCE, **{"b": SOURCE})) #$ flow="SOURCE -> second(..)" def test_lambda_extra_pos():