Files
codeql/python/ql/consistency-queries/DataFlowConsistency.ql
yoff ba0f24f9f4 Python: canonicalize CFG nodes for dataflow
The shared CFG creates multiple ControlFlowNodes per AST node in
conditional contexts (e.g. afterTrue/afterFalse for boolean conditions,
empty/non-empty for for-loops, matched/unmatched for match cases).
These splits matter for control-flow analysis, but for dataflow — where
we ask 'what is the value of this expression?' — we need exactly one
representative per AST node; otherwise we double-count calls, arguments,
and store steps.

This adds Cfg::isCanonicalAstNodeRepresentative as a purely structural
pick: for split AST nodes it selects the 'positive' outcome variant; for
non-split AST nodes it selects the unique variant. The picker is
implemented via genuine-outcome helpers that work around the shared CFG's
cross-kind isAfterValue fallback (ControlFlowGraph.qll:870-892); see
the doc on isGenuineAfterTrue for details.

The TCfgNode-family newtypes in DataFlowPublic, TNormalCall and
TPotentialLibraryCall in DataFlowDispatch, and the SSA-projected
use-use/def-use steps in DataFlowPrivate are all routed through the
canonical filter. DataFlowConsistency and the test UnresolvedCalls
helper qualify their CallNode casts with Cfg:: to keep working.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-28 21:09:49 +00:00

155 lines
6.8 KiB
Plaintext

/**
* Provides consistency queries for checking invariants in the language-specific
* data-flow classes and predicates.
*/
private import python
import semmle.python.dataflow.new.DataFlow::DataFlow
private import semmle.python.dataflow.new.internal.DataFlowImplSpecific
private import semmle.python.dataflow.new.internal.DataFlowDispatch
private import semmle.python.dataflow.new.internal.TaintTrackingImplSpecific
private import codeql.dataflow.internal.DataFlowImplConsistency
private import semmle.python.controlflow.internal.Cfg as Cfg
private module Input implements InputSig<Location, PythonDataFlow> {
  private import Private
  private import Public

  /** Holds if `n` is a flow-summary node. */
  predicate postWithInFlowExclude(Node n) { n instanceof FlowSummaryNode }

  /** Holds if `n` is a flow-summary node. */
  predicate uniqueNodeLocationExclude(Node n) { n instanceof FlowSummaryNode }

  /** Holds if `n` is a flow-summary node. */
  predicate missingLocationExclude(Node n) { n instanceof FlowSummaryNode }

  /**
   * Holds if `n` is an argument passed at a `*args` or `**kwargs` position, or if
   * `n` is covered by `missingArgumentCallExclude`.
   */
  predicate argHasPostUpdateExclude(ArgumentNode n) {
    missingArgumentCallExclude(n)
    or
    // TODO: Implement post-updates for `*args` and for `**kwargs`, see tests added in
    // https://github.com/github/codeql/pull/14936
    exists(ArgumentPosition pos | n.argumentOf(_, pos) |
      pos.isStarArgs(_)
      or
      pos.isDictSplat()
    )
  }

  /**
   * Holds if `n` is the synthetic `**kwargs` parameter node.
   *
   * Because `self`/`cls` parameters can be marked as implicit arguments to `super()`,
   * they get post-update nodes. There is a read step from the synthetic `**kwargs`
   * parameter, but the consistency queries should _not_ report that this synthetic
   * parameter lacks a post-update node.
   */
  predicate reverseReadExclude(Node n) { n instanceof SynthDictSplatParameterNode }

  /**
   * Holds if `p` is the node, at position `pos` of `c`, for a parameter of the function
   * underlying `c` that is reachable both by index (`getArg`) and by name
   * (`getArgByName`).
   *
   * Such normal parameters can be passed either positionally or by keyword, and
   * currently have a parameter position for each of the two cases.
   *
   * TODO: Figure out how bad breaking this consistency check is
   */
  predicate uniqueParameterNodePositionExclude(DataFlowCallable c, ParameterPosition pos, Node p) {
    exists(Parameter param, Function func |
      func = c.getScope() and
      param = func.getArgByName(_) and
      param = func.getArg(_) and
      p = c.getParameter(pos) and
      p = parameterNode(param)
    )
  }

  /**
   * Holds unless `n` is a synthesized capture node whose enclosing callable is a
   * `Function` or a `Module`.
   *
   * Only a selection of scopes are valid callables; for instance, classes are not
   * `DataFlowCallable`s.
   *
   * NOTE(review): as written, this also holds for every `n` that is not a
   * `SynthCaptureNode` (the inline cast then has no result) -- confirm that this
   * breadth is intended.
   */
  predicate uniqueEnclosingCallableExclude(Node n) {
    not (
      n.(SynthCaptureNode).getSynthesizedCaptureNode().getEnclosingCallable() instanceof Function
      or
      n.(SynthCaptureNode).getSynthesizedCaptureNode().getEnclosingCallable() instanceof Module
    )
  }

  /** Holds if the file containing `call` has no relative path. */
  predicate uniqueCallEnclosingCallableExclude(DataFlowCall call) {
    not exists(string relativePath |
      relativePath = call.getLocation().getFile().getRelativePath()
    )
  }

  /** Holds if the file containing `n` has no relative path. */
  predicate identityLocalStepExclude(Node n) {
    not exists(string relativePath | relativePath = n.getLocation().getFile().getRelativePath())
  }

  /**
   * Holds if `c` can be related to the attribute read `attr`, in one of two ways:
   *
   * - `c` and `attr` have the same scope, and the node for the function being called
   *   by `c` has `attr` as a local source; or
   * - the scope of `c` is not a function with a definition, and it is (transitively)
   *   nested inside the scope of `attr`. This covers calls in, for example, the body
   *   of a list-comprehension, where the function cannot be tracked back to the
   *   attribute read and the relation is simply allowed instead.
   */
  pragma[inline]
  private predicate callRelatesToAttrRead(DataFlowCall c, AttrRead attr) {
    c.getScope() = attr.getScope() and
    exists(CfgNode callee |
      callee.asCfgNode() = c.getNode().(Cfg::CallNode).getFunction() and
      callee.getALocalSource() = attr
    )
    or
    not exists(c.getScope().(Function).getDefinition()) and
    c.getScope().getScope+() = attr.getScope()
  }

  /**
   * Holds if `arg` being an argument of `call` as well as of some other call is one of
   * the known, accepted situations enumerated in the body.
   */
  predicate multipleArgumentCallExclude(ArgumentNode arg, DataFlowCall call) {
    // Several `DataFlowCall`s may exist for one `CallNode` (for example when it can
    // resolve to multiple functions), while only _one_ `ArgumentNode` is created per
    // argument of that `CallNode` (see `getCallArg` in DataFlowDispatch.qll).
    exists(Cfg::CallNode sharedNode, DataFlowCall sibling |
      sibling != call and
      sharedNode = call.getNode() and
      sharedNode = sibling.getNode() and
      isArgumentNode(arg, call, _) and
      isArgumentNode(arg, sibling, _)
    )
    or
    // Bound methods that refer to the same `self` argument.
    // Example: in `bm = self.foo; bm(); bm()` both `bm()` calls use the same `self` as
    // their (pos self) argument.
    exists(AttrRead read, DataFlowCall sibling |
      sibling != call and
      arg = read.getObject() and
      arg = call.getArgument(any(ArgumentPosition p | p.isSelf())) and
      arg = sibling.getArgument(any(ArgumentPosition p | p.isSelf())) and
      callRelatesToAttrRead(call, read) and
      callRelatesToAttrRead(sibling, read)
    )
    or
    // `f = getattr(obj, "foo"); f()`, where `obj` is positional argument 0 of the
    // `getattr` call and also the (pos self) argument of the `f()` call.
    exists(AttrRead read, DataFlowCall getAttrCall, DataFlowCall methodCall |
      (call = getAttrCall or call = methodCall) and
      read.(CfgNode).getNode() = getAttrCall.getNode() and
      read.getObject() = arg and
      arg = getAttrCall.getArgument(any(ArgumentPosition p | p.isPositional(0))) and
      arg = methodCall.getArgument(any(ArgumentPosition p | p.isSelf()))
    )
    or
    // In `super(Base, self).foo()`, `self` is an argument of both the `super()` call
    // (pos 1) and the `.foo()` call (pos self).
    exists(DataFlowCall superCall, DataFlowCall methodCall |
      (call = superCall or call = methodCall) and
      arg = superCall.getArgument(any(ArgumentPosition p | p.isPositional(1))) and
      arg = methodCall.getArgument(any(ArgumentPosition p | p.isSelf())) and
      exists(superCallTwoArgumentTracker(_, arg))
    )
    or
    // In `def func(self): super().foo(); super().bar()`, `self` is the (pos self)
    // argument of both the `.foo()` and the `.bar()` call.
    exists(Function f, DataFlowCall sibling |
      sibling != call and
      arg = exprNode(f.getArg(0)) and
      f = call.getNode().getScope() and
      f = sibling.getNode().getScope() and
      arg = call.getArgument(any(ArgumentPosition p | p.isSelf())) and
      arg = sibling.getArgument(any(ArgumentPosition p | p.isSelf()))
    )
  }

  /**
   * Holds if `arg` is an argument-node candidate for which no actual argument relation
   * exists.
   */
  predicate missingArgumentCallExclude(ArgumentNode arg) {
    // Capturing closure arguments do not have corresponding argument nodes in some cases.
    arg instanceof SynthCapturedVariablesArgumentNode and
    not arg.argumentOf(_, _)
    or
    // Argument nodes are overapproximated in order to not rely on the global
    // `getCallArg` predicate, so the cases where there is an approximation but no
    // actual argument node must be excluded.
    arg = getCallArgApproximation() and
    not getCallArg(_, _, _, arg, _)
  }
}
import MakeConsistency<Location, PythonDataFlow, PythonTaintTracking, Input>