diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll index f5b628f7e45..3e6f82284c4 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll @@ -49,6 +49,7 @@ newtype TParameterPosition = pos = 0 or pos = any(Parameter p).getPosition() + 1 } or + TSynthStarArgsElementParameterPosition(int pos) { exists(TStarArgsParameterPosition(pos)) } or TDictSplatParameterPosition() /** A parameter position. */ @@ -65,6 +66,15 @@ class ParameterPosition extends TParameterPosition { /** Holds if this position represents a `*args` parameter at (0-based) `index`. */ predicate isStarArgs(int index) { this = TStarArgsParameterPosition(index) } + /** + * Holds if this position represents a synthetic parameter at or after (0-based) + * position `index`, from which there will be made a store step to the real + * `*args` parameter. + */ + predicate isSynthStarArgsElement(int index) { + this = TSynthStarArgsElementParameterPosition(index) + } + /** Holds if this position represents a `**kwargs` parameter. */ predicate isDictSplat() { this = TDictSplatParameterPosition() } @@ -78,6 +88,11 @@ class ParameterPosition extends TParameterPosition { or exists(int index | this.isStarArgs(index) and result = "*args at " + index) or + exists(int index | + this.isSynthStarArgsElement(index) and + result = "synthetic *args element at (or after) " + index + ) + or this.isDictSplat() and result = "**" } } @@ -132,6 +147,10 @@ predicate parameterMatch(ParameterPosition ppos, ArgumentPosition apos) { or exists(int index | ppos.isStarArgs(index) and apos.isStarArgs(index)) or + exists(int paramIndex, int argIndex | argIndex >= paramIndex | + ppos.isSynthStarArgsElement(paramIndex) and apos.isPositional(argIndex) + ) + or ppos.isDictSplat() and apos.isDictSplat() } @@ -219,8 +238,13 @@ abstract class DataFlowFunction extends DataFlowCallable, TFunction { exists(string name | ppos.isKeyword(name) | result.getParameter() = func.getArgByName(name)) or exists(int index | - ppos.isStarArgs(index) and - result.getParameter() = func.getVararg() + ( + ppos.isStarArgs(index) and + result.getParameter() = func.getVararg() + or + ppos.isSynthStarArgsElement(index) and + result = TSynthStarArgsElementParameterNode(this) + ) | // a `*args` parameter comes after the last positional parameter. We need to take // self parameter into account, so for diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll index 7b2a1d81580..136bdc85f8d 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll @@ -57,6 +57,78 @@ class SyntheticPreUpdateNode extends Node, TSyntheticPreUpdateNode { override Location getLocation() { result = node.getLocation() } } +// ============================================================================= +// *args (StarArgs) related +// ============================================================================= +/** + * A (synthetic) data-flow parameter node to capture all positional arguments that + * should be passed to the `*args` parameter. + * + * To handle + * ```py + * def func(*args): + * for arg in args: + * sink(arg) + * + * func(source1, source2, ...) + * ``` + * + * we add a synthetic parameter to `func` that accepts any positional argument at (or + * after) the index for the `*args` parameter. We add a store step (at any list index) to the real + * `*args` parameter. This means we can handle the code above, but if the code had done `sink(args[0])` + * we would (wrongly) add flow for `source2` as well. + * + * To solve this more precisely, we could add a synthetic argument with position `*args` + * that had store steps with the correct index (like we do for mapping keyword arguments to a + * `**kwargs` parameter). However, if a single call could go to 2 different + * targets with `*args` parameters at different positions, as in the example below, it's unclear what + * index to store `2` at. For the `foo` callable it should be 1, for the `bar` callable it should be 0. + * So this information would need to be encoded in the arguments of a `ArgumentPosition` branch, and + * one of the arguments would be which callable is the target. However, we cannot build `ArgumentPosition` + * branches based on the call-graph, so this strategy doesn't work. + * + * Another approach to solving it precisely is to add multiple synthetic parameters that have store steps + * to the real `*args` parameter. So for the example below, `foo` would need to have synthetic parameter + * nodes for indexes 1 and 2 (which would have store step for index 0 and 1 of the `*args` parameter), + * and `bar` would need it for indexes 1, 2, and 3. The question becomes how many synthetic parameters to + * create, which _must_ be `max(Call call, int i | exists(call.getArg(i)))`, since (again) we can't base + * this on the call-graph. And each function with a `*args` parameter would need this many extra synthetic + * nodes. My gut feeling at that this simple approach will be good enough, but if we need to get it more + * precise, it should be possible to do it like this. + * + * ```py + * def foo(one, *args): ... + * def bar(*args): ... + * + * func = foo if else bar + * func(1, 2, 3) + */ +class SynthStarArgsElementParameterNode extends ParameterNodeImpl, + TSynthStarArgsElementParameterNode { + DataFlowCallable callable; + + SynthStarArgsElementParameterNode() { this = TSynthStarArgsElementParameterNode(callable) } + + override string toString() { result = "SynthStarArgsElementParameterNode" } + + override Scope getScope() { result = callable.getScope() } + + override Location getLocation() { result = callable.getLocation() } + + override Parameter getParameter() { none() } +} + +predicate synthStarArgsElementParameterNodeStoreStep( + SynthStarArgsElementParameterNode nodeFrom, ListElementContent c, ParameterNode nodeTo +) { + c = c and // suppress warning about unused parameter + exists(DataFlowCallable callable, ParameterPosition ppos | + nodeFrom = TSynthStarArgsElementParameterNode(callable) and + nodeTo = callable.getParameter(ppos) and + ppos.isStarArgs(_) + ) +} + // ============================================================================= // **kwargs (DictSplat) related // ============================================================================= @@ -519,6 +591,8 @@ predicate storeStep(Node nodeFrom, Content c, Node nodeTo) { or FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom, c, nodeTo) or + synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo) + or synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo) } @@ -849,6 +923,8 @@ predicate nodeIsHidden(Node n) { or n instanceof SummaryParameterNode or + n instanceof SynthStarArgsElementParameterNode + or n instanceof SynthDictSplatArgumentNode or n instanceof SynthDictSplatParameterNode diff --git a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll index 94d7bb70543..6d6113bc5af 100644 --- a/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll +++ b/python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll @@ -113,6 +113,10 @@ newtype TNode = TSummaryParameterNode(FlowSummaryImpl::Public::SummarizedCallable c, ParameterPosition pos) { FlowSummaryImpl::Private::summaryParameterNodeRange(c, pos) } or + /** A synthetic node to capture positional arguments that are passed to a `*args` parameter. */ + TSynthStarArgsElementParameterNode(DataFlowCallable callable) { + exists(ParameterPosition ppos | ppos.isStarArgs(_) | exists(callable.getParameter(ppos))) + } or /** A synthetic node to capture keyword arguments that are passed to a `**kwargs` parameter. */ TSynthDictSplatArgumentNode(CallNode call) { exists(call.getArgByName(_)) } or /** A synthetic node to allow flow to keyword parameters from a `**kwargs` argument. */ diff --git a/python/ql/test/experimental/dataflow/coverage/argumentPassing.py b/python/ql/test/experimental/dataflow/coverage/argumentPassing.py index 4e55bc48cd7..d9191c4bd80 100644 --- a/python/ql/test/experimental/dataflow/coverage/argumentPassing.py +++ b/python/ql/test/experimental/dataflow/coverage/argumentPassing.py @@ -140,7 +140,7 @@ def test_pos_star(): if len(a) > 0: SINK1(a[0]) - with_star(arg1) #$ MISSING: arg1 func=test_pos_star.with_star + with_star(arg1) #$ arg1 func=test_pos_star.with_star def test_pos_kw(): @@ -209,10 +209,10 @@ def starargs_only(*args): @expects(3*3) def test_only_starargs(): - starargs_only(arg1, arg2, "safe") # $ MISSING: arg1 arg2 + starargs_only(arg1, arg2, "safe") # $ arg1 arg2 SPURIOUS: bad2,bad3="arg1" bad1,bad3="arg2" args = (arg2, "safe") - starargs_only(arg1, *args) # $ MISSING: arg1 arg2 + starargs_only(arg1, *args) # $ arg1 SPURIOUS: bad2,bad3="arg1" MISSING: arg2 args = (arg1, arg2, "safe") # $ arg1 arg2 func=starargs_only starargs_only(*args) @@ -225,7 +225,7 @@ def starargs_mixed(a, *args): @expects(3*8) def test_stararg_mixed(): - starargs_mixed(arg1, arg2, "safe") # $ arg1 MISSING: arg2 + starargs_mixed(arg1, arg2, "safe") # $ arg1 arg2 SPURIOUS: bad3="arg2" args = (arg2, "safe") # $ arg2 func=starargs_mixed starargs_mixed(arg1, *args) # $ arg1 @@ -240,11 +240,11 @@ def test_stararg_mixed(): empty_args = () # adding first/last - starargs_mixed(arg1, arg2, "safe", *empty_args) # $ arg1 MISSING: arg2 + starargs_mixed(arg1, arg2, "safe", *empty_args) # $ arg1 arg2 SPURIOUS: bad3="arg2" starargs_mixed(*empty_args, arg1, arg2, "safe") # $ MISSING: arg1 arg2 # adding before/after *args - args = (arg2, "safe") - starargs_mixed(arg1, *args, *empty_args) # $ arg1 MISSING: arg2 + args = (arg2, "safe") # $ arg2 func=starargs_mixed + starargs_mixed(arg1, *args, *empty_args) # $ arg1 args = (arg2, "safe") starargs_mixed(arg1, *empty_args, *args) # $ arg1 MISSING: arg2 diff --git a/python/ql/test/experimental/dataflow/coverage/test.py b/python/ql/test/experimental/dataflow/coverage/test.py index 1a7b2cbb6fa..65f915cfd9b 100644 --- a/python/ql/test/experimental/dataflow/coverage/test.py +++ b/python/ql/test/experimental/dataflow/coverage/test.py @@ -401,7 +401,7 @@ def f_extra_pos(a, *b): def test_call_extra_pos(): - SINK(f_extra_pos(NONSOURCE, SOURCE)) #$ MISSING: flow="SOURCE -> f_extra_pos(..)" + SINK(f_extra_pos(NONSOURCE, SOURCE)) #$ flow="SOURCE -> f_extra_pos(..)" def f_extra_keyword(a, **b): @@ -514,7 +514,7 @@ def test_lambda_unpack_mapping(): def test_lambda_extra_pos(): f_extra_pos = lambda a, *b: b[0] - SINK(f_extra_pos(NONSOURCE, SOURCE)) #$ MISSING: flow="SOURCE -> f_extra_pos(..)" + SINK(f_extra_pos(NONSOURCE, SOURCE)) #$ flow="SOURCE -> f_extra_pos(..)" def test_lambda_extra_keyword(): @@ -689,7 +689,7 @@ def test_iterable_star_unpacking_in_for_2(): def iterate_star_args(first, second, *args): for arg in args: - SINK(arg) #$ MISSING: flow="SOURCE, l:+5 -> arg" flow="SOURCE, l:+6 -> arg" + SINK(arg) #$ flow="SOURCE, l:+5 -> arg" flow="SOURCE, l:+6 -> arg" # FP reported here: https://github.com/github/codeql-python-team/issues/49 @expects(2)